I'm trying to pull a file from S3 based on the ID and date in the filename.
Naming Convention:
The naming convention is as follows:
ID_NAME_DATE.csv : every filename follows this pattern
example : 9919USEN_File_20180216.csv
example : 9919GBEN_File_20180211.csv
Code:
import boto3
import re

def downloadFiletest():
    #connect to s3
    client = boto3.resource(u's3', aws_access_key_id=u'KEY',
                            aws_secret_access_key=u'TOKEN')
    #used for downloading
    s3 = boto3.client(u's3', aws_access_key_id=u'KEY',
                      aws_secret_access_key=u'TOKEN')

    dateIdReg = '[0-9]{8}'
    dateSuffix = re.compile(dateIdReg)
    print(u"= S3 Client Connected =")

    # configure s3 bucket
    bucket = client.Bucket(u'us-eu-Bucket')
    b_folder = "/folder/example/"
    c_folder = b_folder.lower() + '/'
    files_not_found = True

    for cList in bucket.objects.filter(Prefix=b_folder):
        cFiles = cList.key
        print('file : ', cFiles)
        for fileId in cFiles.lower():
            files_not_found = False
            f = fileId.rstrip()
            print(f)
            fileidreg = '[0-9]{4}[a-zA-Z]{4}'
            FileID = re.compile(fileidreg)
            if FileID.match(f) and dateSuffix.match(f):
                print(u'cList.key.lower(): ', cList.key.lower())
                old_file = cList.key
                dot_index = old_file.find(u'.')
                print(u'old dot file name: ', dot_index)
                file_ext = old_file[dot_index:]
                cfile = fileId + '_file_' + dateSuffix + file_ext
                tmp_path = "/tmp/folder/" + cfile
                b_path = cVal + cfile
                print(u'b path : ', b_path)
                s3.download_file("us-eu-Bucket", b_path, tmp_path)
                print("TEMP PATH: ", tmp_path)

    if files_not_found:
        print("ALERT", "No file in {0}/{1}".format(bucket, b_folder))

downloadFiletest()
Error:
It skips over the for fileId in cFiles.lower(): loop and the script exits.
Goal:
Pull a file from S3 and download it to tmp_path to be used as desired.
When pulling the file, I'd like the script to pick it based on ID and date. For instance:
Rule (pseudo):
If S3 has the files 9919USEN_File_20180216.csv and 9919USEN_File_20180217.csv, then pick 9919USEN_File_20180217.csv to download. Also, if 991USEN_File_2018.csv is in S3, don't pick it, as it doesn't match the rules fileidreg = '[0-9]{4}[a-zA-Z]{4}' and dateIdReg = '[0-9]{8}'.
Rule (visual):
9919USEN_File_20180217.csv > 9919USEN_File_20180216.csv [due to later date]
9919USEN_File_20180217.csv > 991USEN_File_2018.csv [due to incorrect ID and date]
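As a rough sketch of that rule (not from the original post), assuming the candidate filenames have already been listed, the ID/date check and the pick-the-latest step could look like this; the regexes are the ones from the question:

import re

# Hypothetical list of keys already retrieved from the bucket listing
keys = ['9919USEN_File_20180216.csv',
        '9919USEN_File_20180217.csv',
        '991USEN_File_2018.csv']

# A key qualifies only if it starts with the 4-digit/4-letter ID and ends with an 8-digit date
pattern = re.compile(r'^[0-9]{4}[a-zA-Z]{4}_.*_([0-9]{8})\.csv$')

valid = [k for k in keys if pattern.match(k)]
# Pick the most recent file: YYYYMMDD strings sort chronologically
latest = max(valid, key=lambda k: pattern.match(k).group(1)) if valid else None
print(latest)  # 9919USEN_File_20180217.csv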
Solution
The issue was the way the code was structured. I've reorganized it and put it inside a try/except block. I've also used fileIDPrefix.search instead of fileIDPrefix.match, since match only looks at the start of the string and wasn't appropriate for the problem at hand.
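For example (a quick illustration, not part of the original code): the object key includes the folder prefix, so match(), which only anchors at index 0, never sees the ID, while search() scans the whole key:

import re

fileIDPrefix = re.compile('[0-9]{4}[a-zA-Z]{4}')
key = '/folder/example/9919USEN_File_20180216.csv'  # key includes the folder prefix

print(bool(fileIDPrefix.match(key)))   # False - match() anchors at the start of the string
print(bool(fileIDPrefix.search(key)))  # True  - search() scans the whole string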
Final solution:
import boto3
import re
import sys

#connect to s3
client = boto3.resource(u's3', aws_access_key_id=u'KEY',
                        aws_secret_access_key=u'TOKEN')
#used for downloading
s3 = boto3.client(u's3', aws_access_key_id=u'KEY',
                  aws_secret_access_key=u'TOKEN')

bucketname = u'us-eu-Bucket'

def downloadFiletest():
    date = '[0-9]{8}'                    # fileDate regex
    dateSuffix = re.compile(date)        # regex used to check the date of the file
    reg = '[0-9]{4}[a-zA-Z]{4}'          # fileID regex
    fileIDPrefix = re.compile(reg)       # checks the fileID of the filename
    folder = u"/folder/example/"         # directory
    bucket = client.Bucket(bucketname)   # bucket
    try:
        for cuList in bucket.objects.filter(Prefix=folder):  # filter to the folder
            filenames = cuList.key       # key of the object we would like to use
            print(filenames)
            # specific locations of the fileID and the date within the key
            fileID = filenames[33:41]
            fileDate = filenames[51:59]
            # check the length of each value so it can be verified later
            lenf = len(fileID)
            lenG = len(fileDate)
            old_file = cuList.key
            dot_index = old_file.find(u'.')
            file_ext = old_file[dot_index:]
            # check that the file matches our specified rules; if it does, proceed
            if fileIDPrefix.search(filenames) and dateSuffix.search(filenames):
                filename = fileID + u'_file_' + fileDate + file_ext
                tmp_path = "/tmp/mpcmt/" + filename
                file_path = folder + filename
                s3.download_file(bucketname, file_path, tmp_path)
                return filename, tmp_path, fileID, fileDate
            # check the number of characters in the key to see if it matches what is expected
            if dot_index != 59:
                print('File has wrong fileID or wrong date')
            if lenG != 8:
                print('File has wrong fileDate format')
            if lenf != 8:
                print('File has wrong fileID')
    except Exception as e:  # report an error and exit if the file doesn't exist
        print("ALERT", "No file in {0}/{1}".format(bucket, folder))
        # There was some issue / error / problem and that is why the program is exiting.
        print("No file in {0}/{1}".format(bucket, folder), file=sys.stderr)
        print("Exception: %s" % str(e), file=sys.stderr)
        sys.exit(1)

downloadFiletest()
Related
I am working in AWS Cloud9 to download objects from an S3 bucket, concatenate and compress them, and upload the result back to S3.
I am currently using a paginator to limit the number of files per page, but I want to restart the process once the limit is reached so that I iterate through all the objects in the S3 location (e.g. with 100 objects in my bucket: download and process 10, upload back to S3, then repeat with the next 10, and so on, until all 100 objects have been processed and uploaded).
My current code just processes and uploads the first 10 files in the S3 location and then stops. How do I put this process in a loop to achieve the above?
import boto3
import os
from datetime import datetime
import shutil
import gzip

# create the client object
client = boto3.client('s3')
session = boto3.Session()
s3r = session.resource('s3')

# bucket and folder urls
bucket = 'my-bucket'
Prefix = 'env/path'
separator = '/'
env, path = Prefix.split(separator)
dest_folder = 'dest'

# get date info for file name
curr_dt = datetime.now()
dt_str = curr_dt.strftime('%Y%m%d_%H%M%S%f')[:-3]

# initiate list of files to process
files_to_concat = []

paginator = client.get_paginator("list_objects_v2")
operation_parameters = {'Bucket': bucket,
                        'Prefix': Prefix + separator}
page_iterator = paginator.paginate(**operation_parameters, PaginationConfig={'MaxItems': 10})

for page in page_iterator:
    print(page['Contents'])
    for obj in page['Contents']:
        key = obj['Key']
        print(key)
        tmp_dir = '/tmp/' + key[key.rindex('/')+1:]
        if not os.path.exists('/tmp/'):
            os.makedirs(tmp_dir)
        elif not key.endswith('/'):  # filters for files only
            client.download_file(bucket, key, tmp_dir)
            files_to_concat.append(tmp_dir)
            print('Download successful')
        else:
            print('Cannot download a folder')

#concatenate the files
try:
    with open('/tmp/' + dt_str + '_' + path, 'wt') as outfile:
        for f in files_to_concat:
            with open(f, encoding="utf-8", errors='ignore') as infile:
                outfile.write(infile.read())
    print('Successful concatenation of objects')
    # gzip the concatenated file
    try:
        with open('/tmp/' + dt_str + '_' + path, 'rb') as gzinfile:
            with gzip.open('/tmp/' + dt_str + '_' + path + '.gz', 'wb') as gzoutfile:
                shutil.copyfileobj(gzinfile, gzoutfile)
        print('Successful compression of file')
        #delete original concatenated file
        os.remove('/tmp/' + dt_str + '_' + path)
    except Exception as e:
        print(e)
        print('Compression of object failed')
        raise(e)
except Exception as e:
    print(e)
    print('Concatenation of objects failed')
    raise(e)

s3r.meta.client.upload_file('/tmp/' + dt_str + '_' + path + '.gz', bucket,
                            dest_folder + separator + path + separator + dt_str + '_' + path + '.gz')
print('Upload successful!')
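One way to process every object in batches (a sketch, not a tested drop-in: process_batch is a hypothetical helper wrapping the download/concatenate/compress/upload steps above) is to use PageSize rather than MaxItems, since MaxItems caps the total number of keys returned while PageSize only controls how many come back per page:

import boto3

client = boto3.client('s3')
bucket = 'my-bucket'
prefix = 'env/path/'

paginator = client.get_paginator('list_objects_v2')
# PageSize yields 10 keys per page but keeps iterating until the prefix is exhausted
page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix,
                                   PaginationConfig={'PageSize': 10})

for page in page_iterator:
    keys = [obj['Key'] for obj in page.get('Contents', [])
            if not obj['Key'].endswith('/')]   # skip folder placeholders
    if keys:
        process_batch(keys)  # hypothetical: download, concatenate, gzip, and upload this batch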
AIM: I want to record all of the files on a variety of hard disks and collect the file name, folders, and size of each file in megabytes. The code runs and, to my knowledge, doesn't produce any errors, but it doesn't produce the CSV file at the end.
What I've tried:
I've tried running the file with sudo, changing the permissions with chmod +x, checking that Python is in the same place for the standard user and for the sudo user, and lastly removing or commenting out troublesome lines, which seems to yield different results or errors depending on the OS.
import os
from os import path
import sys
import datetime
from datetime import date, time, timedelta
import time
import csv
#from socket import gethostbyname
#variables
#hostname = str(socket.gethostname())
scandir = "/"
savefiledir = "/Users/joshua/Documents/python/"
textfilename = str(datetime.datetime.now().strftime("%Y-%m-%d")) + "_" "directory_printer.csv"
#change directory to the root directory or the one which you want to scan for files (scandir)
os.getcwd()
os.chdir(scandir)
directory = os.getcwd()
#find all files in a directory and its subdirectories, regardless of extension
results = [val for sublist in [[os.path.join(i[0], j) for j in i[2]] for i in os.walk(directory)] for val in sublist]
d = {}
file_count = 0
metadata = []
for file in results:
    #full path
    try:
        fullpath = file
    except:
        fullpath = None
    #file name
    try:
        file_directory = "/".join(str(file).split('/')[1:-1])
    except:
        file_directory = None
    #file extension
    try:
        file_ext = str(file).split('/')[-1]
    except:
        file_ext = None
    #subfolders
    try:
        parts = file_directory.split('/')
        sub_folders = ":".join(parts[1:-1])
    except:
        sub_folders = None
    #num subfolders
    try:
        count_subfolders = len(sub_folders.split(':'))
    except:
        count_subfolders = None
    #filesize megabytes
    try:
        filesize_mb = os.path.getsize(file)/1024
    except:
        filesize_mb = None
    #date modified
    try:
        date_modified = datetime.datetime.now() - datetime.datetime.fromtimestamp(path.getmtime(file))
    except:
        date_modified = None
    #time modified
    #try:
    #    time_modified = os.stat(fullpath).st_mtime #time of most recent content modification
    #except:
    #    time_modified = None
    #time created (windows)
    #try:
    #    time_created = os.stat(fullpath).st_ctime #platform dependent; time of most recent metadata change on Unix, or the time of creation on Windows
    #except:
    #    time_created = None
    #record all file metadata
    d[file_count] = {'Full_Path': fullpath, 'File_Directory': file_directory,
                     'File_Extension': file_ext, 'List_Sub_Folders': sub_folders,
                     'Count_Sub_Folders': count_subfolders, 'Filesize_mb': filesize_mb,
                     'Date_Modified': date_modified}
    file_count = file_count + 1

#write the dictionary with the disk's file metadata to a csv file
with open(textfilename, 'w') as f:
    w = csv.writer(f)
    w.writerows(d.items())

print("Scanning directory: "
      + str(scandir) + " complete!" + "\n"
      + "The results have been saved to: " + "\n"
      + str(savefiledir) + str(textfilename))
As it is, your code will write the CSV file to scandir (/), not to savefiledir, because at the beginning of the program you call os.chdir(scandir). If you want the file in the right place (where the final printed message says it's saved), you should do:
# ...
#write the dictionary with the disk's file metadata to a csv file
with open(savefiledir + textfilename, 'w') as f:
    w = csv.writer(f)
    w.writerows(d.items())
# ...
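An equivalent and slightly more robust variant (my suggestion, not from the original answer) is to build the path with os.path.join, which works whether or not savefiledir ends in a slash; savefiledir, textfilename, and d are the variables from the script above:

import csv
import os

# newline='' avoids blank lines between rows on Windows
with open(os.path.join(savefiledir, textfilename), 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(d.items())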
I wrote a Python script that collects file metadata (filename, creation date, creation time, last modified date, last modified time) from a file directory. However, when the directory is a path located on an external hard drive, the script doesn't work. I can't figure out why.
Here is the code:
import os
from os.path import basename
import datetime
import time

def getSize(filename):
    st = os.stat(filename)
    print st
    return st.st_size

#get last modified date
def getMTime(filename):
    fileModTime = os.path.getmtime(filename)
    return fileModTime

#get creation date
def getCTime(filename):
    fileModTime = os.path.getctime(filename)
    return fileModTime

#get data from directory
MyDirectory = "H:\0_tempfiles\150115_Portfolio\Work\Work\BarBackUp"
MyExtension = ".jpg"

#write to file
WorkingDirectory = "C:\\Users\Admin\Downloads\demo\\"
MyTxtFile = WorkingDirectory + "fileData6.txt"
delim = ";"

with open(MyTxtFile, 'wb') as f:
    f.write(delim.join(["FILENAME", "FILESIZE", "mDATE", "mTIME",
                        "cDATE", "cTIME"]) + "\n")
    for root, dirs, files in os.walk(MyDirectory):
        for file in files:
            if file.endswith(MyExtension):
                #get File Name
                a = (os.path.join(root, file))
                #print a
                filename = a
                MyFileName = basename(a)
                #get File Size
                MyFileSize = getSize(filename) / 1000
                print MyFileName + " >>> file size: " + str(MyFileSize) + "Kb"
                #get modification time V2
                modTimeV2 = getMTime(filename)
                modTimeV2 = time.strftime("%Y/%d/%m;%I:%M:%S %p",
                                          time.localtime(modTimeV2))
                print "time modified: " + str(modTimeV2)
                #get creation time
                creTime = getCTime(filename)
                creTime = time.strftime("%Y/%d/%m;%I:%M:%S %p",
                                        time.localtime(creTime))
                print "time created: " + str(creTime)
                #--------
                #write data to file
                entry = delim.join([str(MyFileName), str(MyFileSize),
                                    str(modTimeV2), str(creTime)]) + "\n"
                f.write(entry)

print "<<<<<<everything went fine>>>>>>"
Your code works fine for me, but your MyDirectory variable has escape characters in it: in a normal string literal, backslash sequences such as \0 are interpreted as escapes rather than literal backslashes. Try adding an r in front of the quotation marks:
MyDirectory = r"H:\0_tempfiles\150115_Portfolio\Work\Work\BarBackUp"
or
MyDirectory = "H:/0_tempfiles/150115_Portfolio/Work/Work/BarBackUp"
or
MyDirectory = "H:\\0_tempfiles\\150115_Portfolio\\Work\\Work\\BarBackUp"
I have a piece of code I wrote for school:
import os
import shutil

source = "/home/pi/lab"
dest = os.environ["HOME"]

for file in os.listdir(source):
    if file.endswith(".c"):
        shutil.move(file, dest + "/c")
    elif file.endswith(".cpp"):
        shutil.move(file, dest + "/cpp")
    elif file.endswith(".sh"):
        shutil.move(file, dest + "/sh")
What this code does is look for files in a source directory; if a certain extension is found, the file is moved to the matching directory. This part works. What I want: if a file of the same name already exists in the destination folder, add a 1 at the end of the file name (before the extension), and if there are multiple copies, keep incrementing the number.
Like this: test1.c, test2.c, test3.c
I tried using os.path.isfile(filename), but this only looks at the source directory, and all I get is True or False.
To test whether the file exists in the destination folder, you should os.path.join the dest folder with the file name:
import os
import shutil

source = "/home/pi/lab"
dest = os.environ["HOME"]

# Avoid using the reserved word 'file' for a variable - renamed it to 'filename' instead
for filename in os.listdir(source):
    # os.path.splitext does exactly what its name suggests - splits the name and extension of the file, including the '.'
    name, extension = os.path.splitext(filename)
    if extension == ".c":
        dest_filename = os.path.join(dest, filename)
        if not os.path.isfile(dest_filename):
            # We copy the file as is
            shutil.copy(os.path.join(source, filename), dest)
        else:
            # We rename the file with a number in the name, incrementing the number until we find one that is not used.
            # This should be moved to a separate function to avoid code duplication when handling the different file extensions
            i = 0
            dest_filename = os.path.join(dest, "%s%d%s" % (name, i, extension))
            while os.path.isfile(dest_filename):
                i += 1
                dest_filename = os.path.join(dest, "%s%d%s" % (name, i, extension))
            shutil.copy(os.path.join(source, filename), dest_filename)
    elif extension == ".cpp":
        ...
        # Handle other extensions
If you want to put the renaming logic in a separate function, using glob and re, this is one way:
import glob
import re

...

def rename_file(source_filename, source_ext):
    filename_pattern = os.path.join(dest, "%s[0-9]*%s"
                                    % (source_filename, source_ext))
    # Contains files such as 'a1.c', 'a2.c', etc...
    existing_files = glob.glob(filename_pattern)
    regex = re.compile("%s([0-9]*)%s" % (source_filename, source_ext))
    # Retrieve the max of the indexes used for this file using regex
    max_index = max([int(match.group(1))
                     for match in map(regex.search, existing_files)
                     if match])
    source_full_path = os.path.join(source, "%s%s"
                                    % (source_filename, source_ext))
    # Rebuild the destination filename with the max index + 1
    dest_full_path = os.path.join(dest, "%s%d%s"
                                  % (source_filename,
                                     (max_index + 1),
                                     source_ext))
    shutil.copy(source_full_path, dest_full_path)

...
# If the file already exists i.e. replace the while loop in the else statement
rename_file(name, extension)
I didn't test the code, but something like this should do the job:
import os

i = 0
filename = "a.txt"
fname, ext = filename.split('.')
# Keep incrementing the counter until we find a filename that is not already taken
while os.path.isfile(filename):
    i += 1
    filename = fname + str(i) + '.' + ext
I'm trying to implement file transfer automation with Python 2.7 on Windows.
I have an FTPS server; I need to move some files from it to a local directory, and to upload some files from local to the FTPS server.
The FTPS structure is like so:
- ROOT FOLDER
  - AAA
    - abc_id1
      - in
      - out
    - abc_id2
      - in
      - out
    - abc_id3
      - in
      - out
  - BBB
    - abc_id1
      - in
      - out
    - abc_id2
      - in
      - out
    - abc_id3
      - in
      - out
I must first MOVE all files matching the wildcard ABC_*.csv, which are located in all the /in folders (e.g. AAA\abc_id1\in), to a local directory.
Then I must upload (COPY) some files matching a wildcard from the local directory to the corresponding abc_*/in folder (e.g. a file named ABC_id3.csv must go to the abc_id3 folder).
I have begun the code:
from ftplib import FTP_TLS
ftps = FTP_TLS('ip_address')
ftps.login("user", "pass") # login before securing control channel
ftps.prot_p() # switch to secure data connection
#ftps.retrlines('LIST') # list directory content securely
ftps.cwd("AAA")
ftps.retrlines('LIST')
ftps.quit()
But I don't know how I can loop through the multiple folders to accomplish the task.
Please suggest some code.
Regards
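Not from the original thread, but a minimal sketch of one way to loop over the folder layout above with ftplib, assuming the AAA/BBB and abc_id*/in structure shown (the local_dir path is hypothetical, and server listings differ, so the path handling may need adjusting):

import fnmatch
import os
from ftplib import FTP_TLS

local_dir = r"C:\transfer"          # hypothetical local download directory

ftps = FTP_TLS('ip_address')
ftps.login("user", "pass")          # login before securing control channel
ftps.prot_p()                       # switch to secure data connection

for group in ("AAA", "BBB"):        # top-level folders
    for sub in ftps.nlst(group):    # abc_id1, abc_id2, ... (some servers return full
                                    # paths, others bare names - adjust joins if needed)
        base_sub = sub.rsplit("/", 1)[-1]
        in_dir = group + "/" + base_sub + "/in"
        for name in ftps.nlst(in_dir):
            base = name.rsplit("/", 1)[-1]
            if fnmatch.fnmatch(base, "ABC_*.csv"):
                local_path = os.path.join(local_dir, base)
                with open(local_path, 'wb') as fh:
                    ftps.retrbinary('RETR ' + in_dir + '/' + base, fh.write)
                ftps.delete(in_dir + '/' + base)  # "move" = download, then delete on the server

ftps.quit()

Uploading in the other direction would be the mirror image, using ftps.storbinary('STOR ' + remote_path, open(local_path, 'rb')) against the corresponding /in folder.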
Two things will help: walking through directories with os.walk, and generators.
You'll want to walk through the directories and check each file as you go. Once you determine it's a file you want, you can apply the appropriate FTP functionality.
Here's a sample from one of the apps I'm working on. I've added the ability to exclude files as well.
# Generator which runs through directories and returns files
def scanDir(self, root, excludeDirs, excludeFiles, excludeExt, maxFileSize):
    global fileList
    print "Scanning directory " + root
    x = 0
    for root, dirnames, filenames in os.walk(root):
        for name in filenames:
            #We want absolute path to these
            absroot = os.path.abspath(root)
            filename = os.path.join(absroot, name)
            fileSize = os.path.getsize(filename) / 1024
            x = x + 1
            #print x
            ##TODO compressed files call here (Extension)
            if (os.path.isfile(filename) and os.path.getsize(filename) > 0):
                if fileSize > maxFileSize:
                    continue
                else:
                    try:
                        #print root + name
                        os.path.getsize(filename)
                        data = open(root + "/" + name, 'rb').read()
                    except:
                        data = False
                        print "Could not read file :: %s/%s" % (root, file)
                    # TODO Create Exception here and filter file paths:
                    # regex for /home/*/mail
                    self.fileList.append({"filename": filename})
                    yield data, filename
Here's an example of recursively walking an FTP server and fetching zip files, with an anonymous login.
#!/usr/bin/env python
from ftplib import FTP
from time import sleep
import os

ftp = FTP('ftp2.census.gov')
ftp.login()

my_dirs = []  # global
my_files = []  # global
curdir = ''   # global

def get_dirs(ln):
    global my_dirs
    global my_files
    cols = ln.split(' ')
    objname = cols[len(cols)-1]  # file or directory name
    if ln.startswith('d'):
        my_dirs.append(objname)
    else:
        if objname.endswith('.zip'):
            my_files.append(os.path.join(curdir, objname))  # full path

def check_dir(adir):
    global my_dirs
    global my_files  # let it accrue, then fetch them all later
    global curdir
    my_dirs = []
    gotdirs = []  # local
    curdir = ftp.pwd()
    print("going to change to directory " + adir + " from " + curdir)
    ftp.cwd(adir)
    curdir = ftp.pwd()
    print("now in directory: " + curdir)
    ftp.retrlines('LIST', get_dirs)
    gotdirs = my_dirs
    print("found in " + adir + " directories:")
    print(gotdirs)
    print("Total files found so far: " + str(len(my_files)) + ".")
    sleep(1)
    for subdir in gotdirs:
        my_dirs = []
        check_dir(subdir)  # recurse
    ftp.cwd('..')  # back up a directory when done here

try:
    check_dir('/geo/tiger/GENZ2012')  # root directory to start in
except:
    print('oh dear.')
    ftp.quit()

ftp.cwd('/.')  # change to root directory for downloading
for f in my_files:
    print('getting ' + f)
    file_name = f.replace('/', '_')  # use path as filename prefix, with underscores
    ftp.retrbinary('RETR ' + f, open(file_name, 'wb').write)
    sleep(1)

ftp.quit()
print('all done!')