How to move only new or updated files? - Python

I am trying to create a script that will move only new or updated files from the past 24 hours into a new folder. So far I have a script that moves files in general; any leads or suggestions would be greatly appreciated.
import os, shutil

source = os.listdir('C:\Users\Student\Desktop\FolderA')
destination = 'C:\Users\Student\Desktop\FolderB'
os.chdir('C:\Users\Student\Desktop\FolderA')
for files in os.listdir("C:\Users\Student\Desktop\FolderA"):
    if files.endswith(".txt"):
        src = os.path.join("C:\Users\Student\Desktop\FolderA", files)
        dst = os.path.join(destination, files)
        shutil.move(src, dst)

I believe I found a solution; let me know what you think.
# copy files from folder_a to folder_b
# if the files in folder_a have been modified within the past 24 hours
# copy them to folder_b
#
import shutil
import os
from os import path
import datetime
from datetime import date, time, timedelta

def file_has_changed(fname):
    # print 'in file_has_changed with file : %s' % fname
    # print str(path.getmtime(fname))
    # get file modified time
    file_m_time = datetime.datetime.fromtimestamp(path.getmtime(fname))
    # print datetime.datetime.now()
    # print file_m_time
    # get the delta between now and the file's mod time
    td = datetime.datetime.now() - file_m_time
    # print td
    # print 'days : %d' % td.days
    # file can be archived if modified within the last 24 hours
    if td.days == 0:
        global ready_to_archive
        ready_to_archive = ready_to_archive + 1
        return True
    else:
        return False

def main():
    global ready_to_archive
    global archived
    ready_to_archive, archived = 0, 0
    # src = "c:\users\gail\desktop\foldera"
    # dst = "c:\users\gail\desktop\folderb"
    for fname in os.listdir('c:\users\gail\Desktop\FolderA'):
        src_fname = 'c:\users\gail\Desktop\FolderA\%s' % fname
        if file_has_changed(src_fname):
            dst_fname = 'c:\users\gail\Desktop\FolderB\%s' % fname
            dst_folder = 'c:\users\gail\Desktop\FolderB'
            try:
                shutil.copy2(src_fname, dst_folder)
                global archived
                archived = archived + 1
                # print 'Copying file : %s ' % (src_fname)
                # print ' To loc : %s ' % (dst_fname)
            except IOError as e:
                print 'could not open the file: %s ' % e

if __name__ == "__main__":
    main()
    print '****** Archive Report for %s ******' % datetime.datetime.now()
    print '%d files ready for archiving ' % ready_to_archive
    print '%d files archived' % archived
    print '****** End of Archive Report ******'
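For comparison, a more compact sketch of the same 24-hour filter without the global counters (the folder paths below are only placeholders, and the cutoff is computed directly from getmtime):
import os
import shutil
import time

SRC = r'C:\Users\Student\Desktop\FolderA'   # placeholder source folder
DST = r'C:\Users\Student\Desktop\FolderB'   # placeholder destination folder

cutoff = time.time() - 24 * 60 * 60         # 24 hours ago, as a timestamp
copied = 0
for name in os.listdir(SRC):
    src_path = os.path.join(SRC, name)
    # copy only regular files modified within the last 24 hours
    if os.path.isfile(src_path) and os.path.getmtime(src_path) >= cutoff:
        shutil.copy2(src_path, os.path.join(DST, name))
        copied += 1
print '%d files copied' % copied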

Related

How to get file metadata from external drive in Python?

I wrote a Python script that collects file metadata (filename, creation date, creation time, last modified date, last modified time) from a file directory. However, when the directory is a path located on an external hard drive the script doesn't work, and I can't figure out why.
Here is the code:
import os
from os.path import basename
import datetime
import time

def getSize(filename):
    st = os.stat(filename)
    print st
    return st.st_size

# get last modified date
def getMTime(filename):
    fileModTime = os.path.getmtime(filename)
    return fileModTime

# get creation date
def getCTime(filename):
    fileModTime = os.path.getctime(filename)
    return fileModTime

# get data from directory
MyDirectory = "H:\0_tempfiles\150115_Portfolio\Work\Work\BarBackUp"
MyExtension = ".jpg"

# write to file
WorkingDirectory = "C:\\Users\Admin\Downloads\demo\\"
MyTxtFile = WorkingDirectory + "fileData6.txt"
delim = ";"

with open(MyTxtFile, 'wb') as f:
    f.write(delim.join(["FILENAME", "FILESIZE", "mDATE", "mTIME",
                        "cDATE", "cTIME"]) + "\n")
    for root, dirs, files in os.walk(MyDirectory):
        for file in files:
            if file.endswith(MyExtension):
                # get file name
                a = (os.path.join(root, file))
                # print a
                filename = a
                MyFileName = basename(a)
                # get file size
                MyFileSize = getSize(filename) / 1000
                print MyFileName + " >>> file size: " + str(MyFileSize) + "Kb"
                # get modification time
                modTimeV2 = getMTime(filename)
                modTimeV2 = time.strftime("%Y/%d/%m;%I:%M:%S %p",
                                          time.localtime(modTimeV2))
                print "time modified: " + str(modTimeV2)
                # get creation time
                creTime = getCTime(filename)
                creTime = time.strftime("%Y/%d/%m;%I:%M:%S %p",
                                        time.localtime(creTime))
                print "time created: " + str(creTime)
                # write data to file
                entry = delim.join([str(MyFileName), str(MyFileSize),
                                    str(modTimeV2), str(creTime)]) + "\n"
                f.write(entry)

print "<<<<<<everything went fine>>>>>>"
Your code works fine for me. The problem is that your "MyDirectory" string contains backslash escape sequences (for example, \0 is read as a null byte), so the path no longer points at the real directory. Try adding an r in front of the quotation marks to make it a raw string:
MyDirectory = r"H:\0_tempfiles\150115_Portfolio\Work\Work\BarBackUp"
or
MyDirectory = "H:/0_tempfiles/150115_Portfolio/Work/Work/BarBackUp"
or
MyDirectory = "H:\\0_tempfiles\\150115_Portfolio\\Work\\Work\\BarBackUp"

python - archive file without full path

I have the following script -
import os
import stat
import zipfile
from datetime import datetime, timedelta
import logging

logfile = 'D:\\logfiles\\MasterLogsArchive\\archive.log'
logging.basicConfig(filename=logfile, format='%(asctime)s %(message)s', level=logging.DEBUG)

try:
    import zlib
    compression = zipfile.ZIP_DEFLATED
except:
    compression = zipfile.ZIP_STORED

modes = {zipfile.ZIP_DEFLATED: 'deflated',
         zipfile.ZIP_STORED: 'stored',
         }

def modified_date(filename):
    return datetime.fromtimestamp(os.stat(filename)[stat.ST_MTIME])

def find_between(s, first, last):
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ""

move_date = datetime.now() - timedelta(minutes=2)
src = "D:\\program files (x86)\\TIDAL\\Scheduler\\Master\\log"

for filename in os.listdir(src):
    full_filename = os.path.join(src, filename)
    scheduler = os.path.join(src, 'scheduler.out')
    if modified_date(full_filename) < move_date and filename.startswith('Master'):
        filedate = find_between(filename, '-', '.')[:-7]
        date = filedate[:-2]
        year = filedate[:-6]
        month = filedate[4:-4]
        day = filedate[6:-2]
        hour = filedate[8:]
        dest = "D:\\logfiles\\MasterLogsArchive\\" + date
        if not os.path.exists(dest):
            os.makedirs(dest)
        zf = dest + '\\' + 'Master%s%s%s-%s.zip' % (year, month, day, hour)
        ## add Master Logs
        if os.path.isfile(full_filename):
            if os.path.isfile(zf):
                try:
                    logging.info('%s is archived' % full_filename)
                    zip = zipfile.ZipFile(zf, mode='a')
                    zip.write(full_filename, compress_type=compression)
                    os.remove(full_filename)
                finally:
                    zip.close()
            else:
                try:
                    logging.info('%s is archived' % full_filename)
                    zip = zipfile.ZipFile(zf, mode='w')
                    zip.write(full_filename, compress_type=compression)
                    os.remove(full_filename)
                finally:
                    zip.close()
The problem I'm having is that the archive stores the full path, which I don't want; I only want the file name inside the zip. If I change the zip.write call to use 'filename' instead of 'full_filename', it then complains that it can't find the file.
So how do I tell write which folder to read the file from while storing only the file name?
The actual write needs to be changed to the following -
zip.write(full_filename, os.path.basename(full_filename), compress_type=compression)
Got the answer from here -
How can I zip file with a flattened directory structure using Zipfile in Python?
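For anyone skimming, a minimal self-contained sketch of that fix (the source path and archive name below are made up): the first argument to write is the path read from disk, and the second (arcname) is the name stored inside the zip.
import os
import zipfile

# hypothetical source file; only its base name should appear inside the zip
full_filename = 'D:\\some\\deep\\folder\\Master20150101-00.log'

zf = zipfile.ZipFile('example.zip', mode='w', compression=zipfile.ZIP_DEFLATED)
try:
    zf.write(full_filename, os.path.basename(full_filename))
finally:
    zf.close()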

Python FTP get the most recent file by date

I am using ftplib to connect to an FTP site. I want to get the most recently uploaded file and download it. I am able to connect to the FTP server and list the files; I have also put them in a list and converted the date field. Is there any function/module that can find the most recent date and output the whole line from the list?
#!/usr/bin/env python

import ftplib
import os
import socket
import sys
import time

HOST = 'test'

def main():
    try:
        f = ftplib.FTP(HOST)
    except (socket.error, socket.gaierror), e:
        print 'cannot reach to %s' % HOST
        return
    print "Connect to ftp server"
    try:
        f.login('anonymous', 'al#ge.com')
    except ftplib.error_perm:
        print 'cannot login anonymously'
        f.quit()
        return
    print "logged on to the ftp server"
    data = []
    f.dir(data.append)
    for line in data:
        datestr = ' '.join(line.split()[0:2])
        orig_date = time.strptime(datestr, '%d-%m-%y %H:%M%p')
    f.quit()
    return

if __name__ == '__main__':
    main()
RESOLVED:
data = []
f.dir(data.append)
datelist = []
filelist = []
for line in data:
    col = line.split()
    datestr = ' '.join(line.split()[0:2])
    date = time.strptime(datestr, '%m-%d-%y %H:%M%p')
    datelist.append(date)
    filelist.append(col[3])
combo = zip(datelist, filelist)
who = dict(combo)
for key in sorted(who.iterkeys(), reverse=True):
    print "%s: %s" % (key, who[key])
    filename = who[key]
    print "file to download is %s" % filename
    try:
        f.retrbinary('RETR %s' % filename, open(filename, 'wb').write)
    except ftplib.error_perm:
        print "Error: cannot read file %s" % filename
        os.unlink(filename)
    else:
        print "***Downloaded*** %s " % filename
    return
f.quit()
return
One problem: is it possible to retrieve just the first element from the dictionary? What I did here is make the for loop run only once and return, which gives me the first sorted value. That works, but I don't think it is good practice to do it this way.
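If it helps, one way to avoid the run-once loop (a sketch that reuses f, datelist and filelist from the snippet above) is to pick the maximum pair directly:
# the newest (date, filename) pair; tuples compare by date first
latest_date, latest_file = max(zip(datelist, filelist))
print "file to download is %s" % latest_file
f.retrbinary('RETR %s' % latest_file, open(latest_file, 'wb').write)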
For those looking for a full solution for finding the latest file in a folder:
MLSD
If your FTP server supports the MLSD command, the solution is easy:
entries = list(ftp.mlsd())
entries.sort(key = lambda entry: entry[1]['modify'], reverse = True)
latest_name = entries[0][0]
print(latest_name)
LIST
If you need to rely on the obsolete LIST command, you have to parse the proprietary listing it returns.
A common *nix listing looks like this:
-rw-r--r-- 1 user group 4467 Mar 27 2018 file1.zip
-rw-r--r-- 1 user group 124529 Jun 18 15:31 file2.zip
With a listing like this, this code will do:
from dateutil import parser
# ...

lines = []
ftp.dir("", lines.append)

latest_time = None
latest_name = None

for line in lines:
    tokens = line.split(maxsplit = 9)
    time_str = tokens[5] + " " + tokens[6] + " " + tokens[7]
    time = parser.parse(time_str)
    if (latest_time is None) or (time > latest_time):
        latest_name = tokens[8]
        latest_time = time

print(latest_name)
This is a rather fragile approach.
MDTM
A more reliable, but far less efficient, approach is to use the MDTM command to retrieve the timestamps of individual files/folders:
names = ftp.nlst()

latest_time = None
latest_name = None

for name in names:
    time = ftp.voidcmd("MDTM " + name)
    if (latest_time is None) or (time > latest_time):
        latest_name = name
        latest_time = time

print(latest_name)
For an alternative version of the code, see the answer by @Paulo.
Non-standard -t switch
Some FTP servers support a proprietary, non-standard -t switch for the NLST (or LIST) command.
lines = ftp.nlst("-t")
latest_name = lines[-1]
See How to get files in FTP folder sorted by modification time.
Downloading found file
No matter what approach you use, once you have the latest_name, you download it as any other file:
with open(latest_name, 'wb') as f:
    ftp.retrbinary('RETR ' + latest_name, f.write)
See also
Get the latest FTP folder name in Python
How to get FTP file's modify time using Python ftplib
Why don't you use the following dir option?
ftp.dir('-t', data.append)
With this option the file listing is ordered by time from newest to oldest. Then just retrieve the first file in the list to download it, as sketched below.
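A rough sketch of that idea (assuming ftp is already logged in, the server honours the non-standard -t switch, and the listing is Unix-style so the file name is the last column):
data = []
ftp.dir('-t', data.append)            # newest entries come first
newest_line = data[0]
filename = newest_line.split()[-1]    # last column of a Unix-style listing
with open(filename, 'wb') as out:
    ftp.retrbinary('RETR %s' % filename, out.write)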
With NLST, as shown in Martin Prikryl's response, you can use the sorted built-in:
ftp = FTP(host="127.0.0.1", user="u",passwd="p")
ftp.cwd("/data")
file_name = sorted(ftp.nlst(), key=lambda x: ftp.voidcmd(f"MDTM {x}"))[-1]
If you have all the dates as time.struct_time (strptime will give you this) in a list, then all you have to do is sort the list.
Here's an example :
#!/usr/bin/python

import time

dates = [
    "Jan 16 18:35 2012",
    "Aug 16 21:14 2012",
    "Dec 05 22:27 2012",
    "Jan 22 19:42 2012",
    "Jan 24 00:49 2012",
    "Dec 15 22:41 2012",
    "Dec 13 01:41 2012",
    "Dec 24 01:23 2012",
    "Jan 21 00:35 2012",
    "Jan 16 18:35 2012",
]

def main():
    datelist = []
    for date in dates:
        date = time.strptime(date, '%b %d %H:%M %Y')
        datelist.append(date)
    print datelist
    datelist.sort()
    print datelist

if __name__ == '__main__':
    main()
I don't know how your FTP server formats its listing, but your example was not working for me. I changed some lines related to the date-sorting part:
import sys
from ftplib import FTP, error_perm
import os
import socket
import time

# Connect to the ftp
ftp = FTP(ftpHost)
ftp.login(yourUserName, yourPassword)

data = []
datelist = []
filelist = []

ftp.dir(data.append)
for line in data:
    col = line.split()
    datestr = ' '.join(line.split()[5:8])
    date = time.strptime(datestr, '%b %d %H:%M')
    datelist.append(date)
    filelist.append(col[8])

combo = zip(datelist, filelist)
who = dict(combo)
for key in sorted(who.iterkeys(), reverse=True):
    print "%s: %s" % (key, who[key])
    filename = who[key]
    print "file to download is %s" % filename
    try:
        ftp.retrbinary('RETR %s' % filename, open(filename, 'wb').write)
    except error_perm:
        print "Error: cannot read file %s" % filename
        os.unlink(filename)
    else:
        print "***Downloaded*** %s " % filename
ftp.quit()

Python recursion limit

So I understand the reason for the recursion limit of 1000. I want to run a script continuously, but am I right in understanding that eventually the recursion limit will be reached (even if I set it higher) and Python will break?
In the scheme of things, it's not a big deal, because I could get the OS to keep restarting the script, but I thought there might be a more elegant solution I could employ within the script itself (swapping threads??).
My script:
import os
import subprocess
import time
import logging
import datetime
from sys import argv

if len(argv) < 3:
    exit('Please provide two arguments - Source Destination')

LOC_DIR = argv[1]
REM_DIR = argv[2]
POLL_INT = 10
RUN_INT = 60
FILE_EXT = '.mov'

# logging setup
logging.basicConfig(filename='%s' % os.path.join(LOC_DIR, '%s the_log.log' % datetime.datetime.now()), level=logging.DEBUG)

# make an easy print and logging function
def printLog(string):
    print '%s %s' % (datetime.datetime.now(), string)
    logging.info('%s %s' % (datetime.datetime.now(), string))

# get the files with absolute paths
def getFiles(path):
    return [os.path.join(path, entry) for entry in os.listdir(path)]

# check if file is still being copied (file size has changed within the poll interval)
def checkSize(path):
    same = False
    while same is False:
        printLog("Processing '%s'" % os.path.basename(path))
        printLog('Waiting %s seconds for any filesize change' % POLL_INT)
        size1 = os.path.getsize(path)
        time.sleep(POLL_INT)
        size2 = os.path.getsize(path)
        if size1 == size2:
            same = True
            printLog('File size stayed the same for %s seconds' % POLL_INT)
            return same
        else:
            printLog('File size change detected. Waiting a further %s seconds' % POLL_INT)

# check if correct file extension
def checkExt(path):
    if path.endswith(FILE_EXT):
        return True

# rsync subprocess
def rsyncFile(path):
    printLog("Syncing file '%s'" % os.path.basename(path))
    try:
        command = ['rsync', '-a', '--remove-source-files', path, REM_DIR]
        p = subprocess.Popen(command, stdout=subprocess.PIPE)
        for line in p.stdout:
            printLog("rsync: '%s'" % line)
        p.wait()
        if p.returncode == 0:
            printLog('<<< File synced successfully :) >>>')
        elif p.returncode == 10:
            printLog('****** Please check your internet connection!! ****** Rsync error code: %s' % p.returncode)
        else:
            printLog('There was a problem. Error code: %s' % p.returncode)
    except Exception as e:
        logging.debug(e)

# main logic
def main():
    all_files = getFiles(LOC_DIR)
    files = []
    for f in all_files:
        if checkExt(f):
            files.append(f)
    if len(files) == 1:
        printLog('<<< Found %s matching file >>>' % len(files))
    elif len(files) > 1:
        printLog('<<< Found %s matching files >>>' % len(files))
    for f in files:
        if checkSize(f):
            rsyncFile(f)
    printLog('No files found. Checking again in %s seconds' % RUN_INT)
    time.sleep(RUN_INT)
    printLog('Checking for files')
    main()

if __name__ == "__main__":
    main()
CPython has no optimizations for recursion, so you really want to avoid deeply-recursive code in favor of regular loops:
def main():
    while True:
        all_files = getFiles(LOC_DIR)
        files = []
        for f in all_files:
            if checkExt(f):
                files.append(f)
        if len(files) == 1:
            printLog('<<< Found %s matching file >>>' % len(files))
        elif len(files) > 1:
            printLog('<<< Found %s matching files >>>' % len(files))
        for f in files:
            if checkSize(f):
                rsyncFile(f)
        printLog('No files found. Checking again in %s seconds' % RUN_INT)
        time.sleep(RUN_INT)
        printLog('Checking for files')

if __name__ == "__main__":
    main()
You're going about this the wrong way.
Replace the recursive call in main with a loop.
# main logic
def main():
    while True:
        all_files = getFiles(LOC_DIR)
        files = []
        for f in all_files:
            if checkExt(f):
                files.append(f)
        if len(files) == 1:
            printLog('<<< Found %s matching file >>>' % len(files))
        elif len(files) > 1:
            printLog('<<< Found %s matching files >>>' % len(files))
        for f in files:
            if checkSize(f):
                rsyncFile(f)
        printLog('No files found. Checking again in %s seconds' % RUN_INT)
        time.sleep(RUN_INT)
        printLog('Checking for files')
The recursion limit only comes into play with recursive functions, as far as I understand, so if you really want to run something repeatedly you can simply use:
while True:
    # repeated stuff goes here
Recursion is an amazing tool, but handle it with care; it can easily end up burning you. You were right that Python only allows about 1000 recursive calls by default, so if your recursive function hasn't finished by then, the exception gets thrown.
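To make the limit concrete, a tiny stand-alone demonstration (unrelated to the script above, just the limit itself):
import sys

print sys.getrecursionlimit()     # typically 1000

def recurse(n):
    return recurse(n + 1)

try:
    recurse(0)
except RuntimeError as e:         # RecursionError in Python 3
    print 'stopped by the interpreter: %s' % e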
Good luck.
