Python processing a range of numbered (dated) files in a directory - python

I am trying to find a range of specific files in a directory using python 2.7.
I have many files in a directory that are named like AB_yyyyjjjhhmmss_001.txt, where y is the year, j is the Julian day (day of the year), h is the hour, and so on. Each timestamp corresponds to the time some data was taken, not necessarily the time the file was created or modified. I would like to pick out a range of times, say from 2013305010000 to 2013306123000, and process the matching files.
I have something like,
import glob
# Asker's attempt: take (or prompt for) a start and end timestamp and try to
# glob for every file whose name falls inside that range.
def get_time (start_time = None, end_time = None):
# Prompt interactively for whichever bound was not supplied.
if start_time == None:
start_time = input("start: ")
if end_time == None:
end_time = input("end: ")
# Builds a string of the form "<start>-<end>".
duration = str(start_time) + "-" + str(end_time)
# NOTE(review): inside a glob pattern, [...] is a character class that matches
# exactly one character, so this cannot select a numeric range of timestamps.
# The listing is also never returned to the caller.
listing = glob.glob("*_[" + duration + "]_*")
I learned that [ ] is only meant to match a single character, so I am totally off track here. I also tried a {start_time..end_time} combination, to no avail.

If all files have the same structure, you can simply write:
import os
import re
import sys

# Inclusive bounds, given on the command line as yyyyjjjhhmmss strings.
# Because every timestamp has the same fixed width, plain string comparison
# orders them chronologically.
start = sys.argv[1]
end = sys.argv[2]

for filename in os.listdir('test'):
    fields = filename.split('_')
    # Names without a timestamp field are skipped instead of raising IndexError.
    if len(fields) > 1 and start <= fields[1] <= end:
        print("Process %s" % filename)
Example:
$ ls test
AB_2013105010000_001.txt AB_2013305010000_001.txt AB_2013306103000_001.txt
AB_2013306123000_001.txt AB_2013316103000_001.txt
$ python t.py 2013305010000 2013306123000
Process AB_2013305010000_001.txt
Process AB_2013306103000_001.txt
Process AB_2013306123000_001.txt

I might try
import re
import os
import datetime
def filename_to_datetime(filename):
    """Return the timestamp embedded in *filename* as a ``datetime``.

    The name must contain a 13-digit ``yyyyjjjhhmmss`` field (year, day of
    year, hour, minute, second), e.g. ``AB_2013305010000_001.txt``.

    Raises:
        ValueError: if the name has no 13-digit field, or the digits do not
            form a valid timestamp.
    """
    filedate = re.match(r'.*(\d{13}).*', filename)
    if filedate is None:
        raise ValueError("File has wrong format!")
    # strptime takes the captured digits and the format directly.
    return datetime.datetime.strptime(filedate.group(1), '%Y%j%H%M%S')
def get_time(start_time, end_time):
    """Return the names in the current directory dated between the bounds.

    Args:
        start_time: exclusive lower bound, as a ``datetime``.
        end_time: exclusive upper bound, as a ``datetime``.

    Returns:
        A list of file names whose embedded timestamp lies strictly between
        *start_time* and *end_time*.
    """
    selected = []
    for filename in os.listdir('.'):
        try:
            stamp = filename_to_datetime(filename)
        except ValueError:
            # Unrelated entries (the script itself, sub-directories, ...) must
            # not abort the whole listing.
            continue
        if start_time < stamp < end_time:
            selected.append(filename)
    return selected

Related

I want to loop My file creator a certain amount of times in python

Here is the code i am not sure what i have to do to loop it a certain amount of times
#import stuff
import os
import random
import string
# Generate a random 10-character name from lowercase ASCII letters.
letters = string.ascii_lowercase
strgen = ( ''.join(random.choice(letters) for i in range(10)) )
# Build the target path under c:/files, creating that directory if it is missing.
filepath = os.path.join('c:/files/' + strgen + '.txt')
if not os.path.exists('c:/files'):
os.makedirs('c:/files')
# "w+" creates (or truncates) the file; the handle is left open here.
f = open(filepath, "w+")
not sure if I got it right but do you mean something like below?
letters = string.ascii_lowercase

# The target directory only needs to be checked once, before the loop.
if not os.path.exists('c:/files'):
    os.makedirs('c:/files')

# Create ten empty files, each with a random 10-letter name.
for _ in range(10):
    strgen = ''.join(random.choice(letters) for i in range(10))
    #creating file and saving into dir
    filepath = os.path.join('c:/files/' + strgen + '.txt')
    # "w+" creates (or truncates) the file; the with-block closes the handle
    # so the loop does not leak one open file per iteration.
    with open(filepath, "w+") as f:
        pass
Place whatever you want to perform inside the action function.
setInterval starts calling action repeatedly, once per interval.
The Timer sets the stop time, after which the calls stop.
Number of executions ≈ stop time / interval (here 5 / 0.6, i.e. about 8).
import time, threading
# Reference point for the elapsed times printed below.
StartTime=time.time()
# Example callback: print the seconds elapsed since StartTime.
def action() :
print('action ! -> time : {:.1f}s'.format(time.time()-StartTime))
# Calls `action` repeatedly, once per `interval` seconds, on a background
# thread until cancel() is called.
class setInterval :
# Stores the settings and starts the worker thread immediately.
def __init__(self,interval,action) :
self.interval=interval
self.action=action
# Set by cancel() to make the worker loop exit.
self.stopEvent=threading.Event()
thread=threading.Thread(target=self.__setInterval)
thread.start()
# Worker loop executed on the background thread.
def __setInterval(self) :
nextTime=time.time()+self.interval
# Event.wait() returns True once the event is set, which ends the loop;
# otherwise it times out at nextTime and the action runs.
while not self.stopEvent.wait(nextTime-time.time()) :
# Advance from the previous deadline rather than from the current time.
nextTime+=self.interval
self.action()
# Signals the worker loop to stop.
def cancel(self) :
self.stopEvent.set()
# run action every 0.6s (the interval passed below)
inter=setInterval(0.6,action)
print('just after setInterval -> time : {:.1f}s'.format(time.time()-StartTime))
# cancel the interval after 5s, using a one-shot Timer thread
t=threading.Timer(5,inter.cancel)
t.start()

How to delete files based on the creation time for each day in python

I have a system that generates 3 files per minute every day. The files before 4AM and after 10PM for that particular day are insignificant and they consume a lot of space. I want to get rid of them. The files are generated continuously and hence there is almost a month of data files stored in a subfolder for each day. How can I remove the data files that are insignificant by using python code?
My code is as follows:
from datetime import date, timedelta
# Yield start_date, start_date + 1 day, ... for each whole day counted by
# (end_date - start_date).days.
def daterange(start_date, end_date):
for n in range(int ((end_date - start_date).days)):
# timedelta(n) is an offset of n days.
yield start_date + timedelta(n)
# NOTE(review): datetime.datetime / datetime.timedelta need `import datetime`,
# and os needs `import os`; the import line above only brings in `date` and
# `timedelta`.
start_date = datetime.datetime(2020, 3, 5,00,00)
end_date = datetime.datetime(2020, 3, 8, 23,59)
# single_date is not used inside the loop body.
for single_date in daterange(start_date, end_date):
fpath = r"C:\Users\basantrp\Desktop\Data Trimming"
os.chdir(fpath)
for root, dirs, files in os.walk(fpath):
for f in files:
# NOTE(review): this reads the modification time of fpath (the top-level
# folder), not of the file f, so every file reports the same timestamp.
st=os.path.getmtime(fpath)
print(datetime.datetime.fromtimestamp(st))
# timedelta(0, 18000) is 0 days + 18000 seconds, i.e. 5 hours after start_date.
if datetime.datetime.fromtimestamp(st) < (start_date + datetime.timedelta(0,18000)):
# NOTE(review): f is a bare file name; it is resolved against the current
# directory set by chdir above, not against root.
os.unlink(f)
But this doesn't seem to work. the output from
datetime.datetime.fromtimestamp(st) is 2020-03-19 00:16:10.550944
This is not a desired solution because this shows the time quite close to when the program was compiled or initiated.
# importing the required modules
import os
import shutil
import time
# main function
def main():
    """Delete everything at ``path`` that is at least ``days`` days old.

    If ``path`` is a directory it is walked top-down: an expired root folder
    is removed as a whole; otherwise each expired sub-folder and file is
    removed individually. If ``path`` is a single file it is removed when
    expired. Totals are printed at the end.
    """
    # initializing the count
    deleted_folders_count = 0
    deleted_files_count = 0

    # specify the path
    path = "/PATH_TO_DELETE"

    # specify the days
    days = 30

    # Cutoff timestamp: anything stamped at or before it counts as expired.
    # time.time() returns current time in seconds
    seconds = time.time() - (days * 24 * 60 * 60)

    if not os.path.exists(path):
        # file/folder is not found -- nothing was deleted, so the counters
        # are left untouched
        print(f'"{path}" is not found')

    elif os.path.isdir(path):
        # iterating over each and every folder and file in the path
        for root_folder, folders, files in os.walk(path):

            # comparing the days
            if seconds >= get_file_or_folder_age(root_folder):
                # removing the folder
                remove_folder(root_folder)
                deleted_folders_count += 1  # incrementing count

                # breaking after removing the root_folder
                break

            # checking folder from the root_folder
            for folder in folders:
                folder_path = os.path.join(root_folder, folder)
                if seconds >= get_file_or_folder_age(folder_path):
                    remove_folder(folder_path)
                    deleted_folders_count += 1  # incrementing count

            # checking the current directory files
            for file in files:
                file_path = os.path.join(root_folder, file)
                if seconds >= get_file_or_folder_age(file_path):
                    remove_file(file_path)
                    deleted_files_count += 1  # incrementing count

    else:
        # the path is a single file, not a directory
        if seconds >= get_file_or_folder_age(path):
            remove_file(path)
            deleted_files_count += 1  # incrementing count

    print(f"Total folders deleted: {deleted_folders_count}")
    print(f"Total files deleted: {deleted_files_count}")
def remove_folder(path):
    """Recursively delete the folder at ``path`` and print the outcome.

    ``shutil.rmtree`` returns ``None`` and reports failure by raising, so the
    outcome is decided by exception handling rather than by its return value.

    Raises:
        OSError: re-raised unchanged after the failure message is printed, so
            callers see the same exception as before.
    """
    try:
        shutil.rmtree(path)
    except OSError:
        # failure message
        print(f"Unable to delete the {path}")
        raise
    # success message
    print(f"{path} is removed successfully")
def remove_file(path):
    """Delete the file at ``path`` and print the outcome.

    ``os.remove`` returns ``None`` and reports failure by raising, so the
    outcome is decided by exception handling rather than by its return value.

    Raises:
        OSError: re-raised unchanged after the failure message is printed, so
            callers see the same exception as before.
    """
    try:
        os.remove(path)
    except OSError:
        # failure message
        print(f"Unable to delete the {path}")
        raise
    # success message
    print(f"{path} is removed successfully")
# Return the st_ctime timestamp of path (a point in time, in seconds since the
# epoch), which callers compare against their cutoff.
# NOTE: per the os.stat documentation, st_ctime is the last metadata change on
# Unix and the creation time on Windows.
def get_file_or_folder_age(path):
# getting ctime of the file/folder
# time will be in seconds
ctime = os.stat(path).st_ctime
# returning the time
return ctime
if __name__ == '__main__':
main()
You need to adjust the following two variables in the above code based on the requirement.
days = 30
path = "/PATH_TO_DELETE"

How to compare ctime properly?

I have a program that gets the modified date/time of directories and files. I then want to get the date/time from 30 seconds ago and compare that to the modified date/time.
If the modified time is less than 30 seconds ago, I want to trigger an alert. My code is triggering alert even if the modified time occurred more than 30 seconds ago.
Is there a way I can only trigger an alert if the modification occurred less than 30 seconds ago?
import os.path
import time, stat
import sys
share_dir = 'C:/mydir'
source_dir = r'' + share_dir + '/'
# Alert hook: called when a recent modification is found.
def trigger():
print("Triggered")
# Repeatedly walk source_dir looking for a file or folder modified in the last
# 30 seconds; on the first hit, call trigger() and exit the program.
def check_dir():
while True:
for currentdir, dirs, files in os.walk(source_dir):
for file in files:
currentfile = os.path.join(currentdir, file)
# get modified time for files
ftime = os.stat(currentfile )[stat.ST_MTIME]
past = time.time() - 30 # last 30 seconds
# NOTE(review): time.ctime() returns a display string (weekday name first),
# so >= compares the two strings alphabetically, not chronologically.
if time.ctime(ftime) >= time.ctime(past):
print(time.ctime(ftime) + " > " + time.ctime(past))
print("Found modification in last 30 seconds for file =>", currentfile, time.ctime(ftime))
trigger()
sys.exit()
else:
print('No recent modifications.' + currentfile)
for folder in dirs:
currentfolder = os.path.join(currentdir, folder)
# get modified time for directories
dtime = os.stat(currentfolder)[stat.ST_MTIME]
past = time.time() - 30 # last 30 seconds
# NOTE(review): same string comparison as in the file branch above.
if time.ctime(dtime) >= time.ctime(past):
print(time.ctime(dtime) + " > " + time.ctime(past))
print("Found modification in last 30 seconds for folder =>", currentfolder, time.ctime(dtime))
trigger()
sys.exit()
else:
print('No recent modifications: ' + currentfolder)
# pause 4 seconds between checks
time.sleep(4)
if __name__ == "__main__":
check_dir()
I'm doing this on a large scale file system. I personally use SQLite3 and round the mtime of the file (I had weird things happen using any other sort of operation and it was more consistent).
I'm also unsure why you're not just doing a pure math solution. Take the current time, take the mtime of the file, find the difference between them and if it's less than or equal to thirty, you get a hit.
I redid some of the code. I recommend trying this:
import os.path
import time, stat
import sys
# Alert hook: called when a recent modification is found.
def trigger():
print("Triggered")
def check_dir(source_dir):
    """Walk ``source_dir`` and stop at the first entry modified in the last 30 seconds.

    Within each directory, files are checked before sub-directories. On the
    first hit a message is printed, ``trigger()`` is called and the process
    exits with status 0. If nothing recent is found the function returns
    after printing one line per entry.

    Raises:
        SystemExit: via ``sys.exit(0)`` when a recent modification is found.
    """
    for currentdir, dirs, files in os.walk(source_dir):
        for file in files:
            currentfile = os.path.join(currentdir, file)
            # get modified time for files
            ftime = os.path.getmtime(currentfile)
            if time.time() - ftime <= 30:
                print("Found modification in last 30 seconds for file =>", currentfile, time.ctime(ftime))
                trigger()
                # sys.exit, not the interactive exit() helper installed by
                # the site module, which is not guaranteed to exist.
                sys.exit(0)
            else:
                print('No recent modifications.' + currentfile)
        for folder in dirs:
            currentfolder = os.path.join(currentdir, folder)
            # get modified time for directories; same float-precision call as
            # for files, so both branches apply the 30-second window alike
            dtime = os.path.getmtime(currentfolder)
            if time.time() - dtime <= 30:
                print("Found modification in last 30 seconds for folder =>", currentfolder, time.ctime(dtime))
                trigger()
                sys.exit(0)
            else:
                print('No recent modifications: ' + currentfolder)
if __name__ == "__main__":
check_dir('yourdirectoryhere')
Did some light testing on my own system and it seemed to work perfectly. Might want to add back the while loop but it should work.

Moving Files by creation/modification date then moving with Python

I am new to programming, even more so with Python, so please excuse any ignorance on my part. I am trying to write a script for myself that will move files that have been modified in the last 24 hours. So far I have come up with this:
import datetime
import os
import shutil
# NOTE(review): these are not raw strings, so the backslashes are subject to
# escape processing.
src = "C:\Users\Student\Desktop\FolderA"
dst = "C:\Users\Student\Desktop\FolderB"
# NOTE(review): the import above is `import datetime`; no name `dt` is defined.
now = dt.datetime.now()
before = now - dt.timedelta(hours=24)
# Returns the modification time of fname as reported by os.path.getmtime.
def mins_since_mod(fname):
return (os.path.getmtime(fname))
for fname in os.listdir(src):
# NOTE(review): this compares the function object itself with `before`;
# mins_since_mod is never called with a file name.
if mins_since_mod > before:
src_fname = os.path.join(src,fname)
# The joined destination path is computed but not stored.
os.path.join(dst,fname)
shutil.move(src_fname, dst)
I know i'm close to the solution, but I can't seem to figure out how to get this to work. I looked around here on the community and was not able to find a solution to my problem. Thank you for any leads or suggestions.
There are a few things to change. First, you can't compare the datetime in before to the Unix timestamp that getmtime() returns. It's easier to just use that directly. Also, you actually need to pass the (full) filename to mins_since_mod() for it to do anything.
Here's something that should work, changing the name of mins_since_mod() to reflect what it does better:
import time
import os
import shutil
SECONDS_IN_DAY = 24 * 60 * 60

# Raw strings keep the Windows backslashes literal; in a plain Python 3 string
# "\U" starts a unicode escape and "C:\Users" is a syntax error.
src = r"C:\Users\Student\Desktop\FolderA"
dst = r"C:\Users\Student\Desktop\FolderB"

# Cutoff as a Unix timestamp, directly comparable with getmtime() results.
now = time.time()
before = now - SECONDS_IN_DAY


def last_mod_time(fname):
    """Return the last-modification time of ``fname`` as a Unix timestamp."""
    return os.path.getmtime(fname)


# Move every entry of src that was modified within the last 24 hours.
for fname in os.listdir(src):
    src_fname = os.path.join(src, fname)
    if last_mod_time(src_fname) > before:
        dst_fname = os.path.join(dst, fname)
        shutil.move(src_fname, dst_fname)
Hey mate, I have actually just done something like this myself. I found that there will be a few issues with the time comparison, as well as some issues in comparing and moving folders.
Try this:
import os
import shutil
import datetime
def filter_by_date(src_folder, archive_date):
    """Return the sub-directories of ``src_folder`` older than 24 hours.

    Args:
        src_folder: directory to scan. As a side effect it becomes the
            current working directory, so the returned names (which are
            relative) can be used directly by the caller.
        archive_date: reference time as a Unix timestamp, e.g. ``time.time()``.

    Returns:
        Names of the directories whose modification time is more than
        24 hours before ``archive_date``.
    """
    os.chdir(src_folder)
    delay_time = 24 * 60 * 60
    archive_period = archive_date - delay_time
    # getmtime() already returns a Unix timestamp, so compare number with
    # number; comparing a datetime object with this float raises TypeError.
    return [
        name for name in os.listdir(u'.')
        if os.path.isdir(name)
        and os.path.getmtime(name) < archive_period
    ]
if __name__ == '__main__':
    # time.time() is used below, but the import list above omits `time`.
    import time

    folders = filter_by_date("C:/Users/Student/Desktop/FolderA", time.time())
    for files in folders:
        print(files)
        try:
            shutil.copytree(files, os.path.join("C:/Users/Student/Desktop/New", files))
        except shutil.Error as e:
            # Listed before OSError: on Python 3 shutil.Error is a subclass of
            # OSError, so the opposite order would make this handler
            # unreachable.
            # Retry with a UTF-8 encoded name (a Python 2 byte-string
            # workaround for non-ASCII names).
            try:
                files = files.encode('UTF-8')
                dst_path = os.path.join('C:/Users/Student/Desktop/FolderB/', files)
                shutil.copytree(files, dst_path)
            except (OSError, shutil.Error):
                # Report failure only when the retry failed as well.
                print('\nDirectory not copied. Error: %s' % e)
        except OSError as e:
            print('\nDirectory not copied. Error: %s' % e)
    print("\nCompleted")
This is going to ensure that any file name (including Chinese, Russian and Japanese names) and any folder (directory or sub-directory) is copied. It will also keep all file attributes.

Successive multiprocessing

I am filtering huge text files using multiprocessing.py. The code basically opens the text files, works on it, then closes it.
Thing is, I'd like to be able to launch it successively on multiple text files. Hence, I tried to add a loop, but for some reason it doesn't work (while the code works on each file). I believe this is an issue with:
if __name__ == '__main__':
However, I am looking for something else. I tried to create a Launcher and a LauncherCount files like this:
LauncherCount.py:
# Store n in the module-level name LauncherCount of this module.
def setLauncherCount(n):
global LauncherCount
LauncherCount = n
and,
Launcher.py:
import os
import LauncherCount
# Set the index, then run the filter script; the pair is repeated once per
# input file.
# NOTE(review): os.system runs the command in a subshell, i.e. a separate
# process, which presumably imports its own copy of LauncherCount and does
# not see the value set here -- confirm.
LauncherCount.setLauncherCount(0)
os.system("OrientedFilterNoLoop.py")
LauncherCount.setLauncherCount(1)
os.system("OrientedFilterNoLoop.py")
...
I import LauncherCount.py, and use LauncherCount.LauncherCount as my loop index.
Of course, this doesn't work either, as it edits the variable LauncherCount.LauncherCount locally, so it won't be edited in the imported version of LauncherCount.
Is there any way to edit globally a variable in an imported file? Or, is there any way to do this in any other way? What I need is running a code multiple times, in changing one value, and without using any loop apparently.
Thanks!
Edit: Here is my main code if necessary. Sorry for the bad style ...
import multiprocessing
import config
import time
import LauncherCount
class Filter:
    """ Filtering methods """

    def __init__(self):
        print("launching methods")

    def LatLong(self, line):
        """Return ``[latitude, longitude]`` parsed from a comma-separated line.

        Latitude is the text between the first and second comma, longitude
        the text between the second and third. ``[0, 0]`` is returned when
        the line has fewer than three commas or either field is not a number.
        """
        fields = line.split(',', 3)
        # Fewer than three commas: the fields cannot be located, so report the
        # line as unparseable instead of slicing at position -1.
        if len(fields) < 4:
            return [0, 0]
        try:
            return [float(fields[1]), float(fields[2])]
        except ValueError:
            return [0, 0]

    def LatLongFilter(self, LatLongList, tupleFilter):
        """Tell whether a point lies inside a Lat/Long rectangle.

        Args:
            LatLongList: ``[latitude, longitude]``.
            tupleFilter: ``(minLat, maxLat, minLong, maxLong)``; the bounds
                are inclusive.

        Returns:
            True if the point is within the rectangle, False if not.
        """
        return (tupleFilter[0] <= LatLongList[0] <= tupleFilter[1]
                and tupleFilter[2] <= LatLongList[1] <= tupleFilter[3])

    def writeLine(self, key, line):
        """Write ``line`` to the output file stored at ``filterDico[key][1]``.

        ``filterDico`` is looked up as a module-level name at call time.
        """
        filterDico[key][1].write(line)
# Worker body: read lines from the module-level readFile until the end of the
# file and write each usable line to every output whose rectangle contains its
# Lat/Long.
# dico maps a key to a sequence whose first item is the rectangle tuple.
def filteringProcess(dico):
myFilter = Filter()
while True:
try:
currentLine = readFile.readline()
# NOTE(review): presumably guards against readFile having been closed
# already (reading a closed file raises ValueError) -- confirm.
except ValueError:
break
if len(currentLine) ==0: # Breaks at the end of the file
break
if len(currentLine) < 35: # Deletes wrong lines (too short)
continue
LatLongList = myFilter.LatLong(currentLine)
# A line is written once for every rectangle that contains it.
for key in dico:
if myFilter.LatLongFilter(LatLongList,dico[key][0]):
myFilter.writeLine(key,currentLine)
###########################################################################
# Main
###########################################################################
# Open the input file selected by LauncherCount.LauncherCount:
readFile = open(config.readFileList[LauncherCount.LauncherCount][1], 'r')
# Generate writing files:
pathDico = {}
filterDico = config.filterDico
# Create one output file per filter key; each filterDico value is replaced by
# a [original value, open output file] pair.
for key in filterDico:
# NOTE(review): as shown, this expression continues on the next line without
# a backslash or enclosing parentheses.
output_Name = config.readFileList[LauncherCount.LauncherCount][0][:-4]
+ '_' + key +'.log'
pathDico[output_Name] = config.writingFolder + output_Name
filterDico[key] = [filterDico[key],open(pathDico[output_Name],'w')]
p = []
# One worker process per CPU reported by multiprocessing.
CPUCount = multiprocessing.cpu_count()
CPURange = range(CPUCount)
startingTime = time.localtime()
if __name__ == '__main__':
### Create and start processes:
for i in CPURange:
p.append(multiprocessing.Process(target = filteringProcess ,
args = (filterDico,)))
p[i].start()
### Wait for the processes to finish:
# Polls is_alive() in a loop with no sleep until every worker has stopped,
# then closes the input and output files.
while True:
if [p[i].is_alive() for i in CPURange] == [False for i in CPURange]:
readFile.close()
for key in config.filterDico:
config.filterDico[key][1].close()
print(key,"is Done!")
endTime = time.localtime()
break
print("Process started at:",startingTime)
print("And ended at:",endTime)
To process groups of files in sequence while working on files within a group in parallel:
#!/usr/bin/env python
from multiprocessing import Pool
def work_on(args):
"""Process a single file."""
# args is an (index, filename) pair, as produced by enumerate() in
# process_files.
i, filename = args
print("working on %s" % (filename,))
# The index is returned as the result for this input.
return i
def files():
"""Generate input filenames to work on."""
#NOTE: you could read the file list from a file, get it using glob.glob, etc
# Placeholder names for the example.
yield "inputfile1"
yield "inputfile2"
def process_files(pool, filenames):
"""Process filenames using pool of processes.
Wait for results.
"""
# Iterating over the imap_unordered results to the end is what makes this call
# wait until every file in this group has been processed.
for result in pool.imap_unordered(work_on, enumerate(filenames)):
#NOTE: in general the files won't be processed in the original order
print(result)
def main():
    """Process several groups of files, one group after another.

    Files inside a group are handled in parallel by a shared pool; the next
    group is started only after process_files() has returned for the
    previous one.
    """
    p = Pool()
    try:
        # to do "successive" multiprocessing
        for filenames in [files(), ['other', 'bunch', 'of', 'files']]:
            process_files(p, filenames)
    finally:
        # Shut the worker processes down explicitly instead of leaving them
        # to be cleaned up at interpreter exit.
        p.close()
        p.join()
if __name__=="__main__":
main()
Each process_files() call runs in sequence, after the previous one has completed, i.e., the files from different calls to process_files() are not processed in parallel.

Categories

Resources