AWS Lambda loop through files using boto3

AWS Lambda loop through files using boto3 - python

I have the following lambda function that will search my my s3 bucket with the prefix being the current time in milliseconds. I have about 600-800k files per hour that I would like to do some manipulation to. This code works as intended but takes forever to scan the prefix. I have a feeling that this part of my code is not efficient. Since this lambda function is scheduled to run every 10 mins I have my min range set to go back up to 11 mins in milliseconds. I would greatly appreciate if someone could help me make this piece more efficient if possible.
import os
import boto3
import json
import tempfile
import re
from datetime import date, datetime,timezone
import _datetime
import time
def lambda_handler(event, context):
# TODO implement
s3_client = boto3.client("s3")
s3_resource = boto3.resource('s3')
paginator = s3_client.get_paginator('list_objects_v2')
keys = []
result = []
now = int(round(time.time() * 1000))
now_min = now - 660000 # 11 mins
times = list(range(now_min,now+1))
for t in times:
prefix = 'Uploads/' + str(datetime.now(timezone.utc).strftime("%Y-%m-%d")) + '/' + str(datetime.utcnow().strftime('%H')) + '/' + str(t)
pages = paginator.paginate(Bucket='bucket', Prefix=prefix)
for page in pages:
if page.get('KeyCount') != 0:
for obj in page['Contents']:
keys.append(obj['Key'])
for key in keys[1:]:
The goal is take these 800k files and condense them into multiple larger files instead of having 800k small files.
for key in keys[1:]:
local_filepath = os.path.join(tempfile.gettempdir(), key)
regex_local_filepath = '/tmp/' + re.search('([^\/]+$)', local_filepath).group(0)
re_key = re.search('([^-/]+$)', key).group(0)
re_key = re_key.replace('.json','')
s3_resource.Bucket('bucket').download_file(key,regex_local_filepath)
with open (regex_local_filepath,'r') as infile:
result.append(json.load(infile))
file_name = 'Uploads/' + str(datetime.now(timezone.utc).strftime("%Y-%m-%d")) + '/' + str(datetime.utcnow().strftime('%H')) + '/' + str(now) + '.json'
s3object = s3_resource.Object('new-bucket', file_name)
s3object.put(
Body=(bytes(json.dumps(result, indent=2, sort_keys=True).encode('UTF-8')))
)
return None

I have figured out the correct way to efficiently loop through. It seems I was looping through multiple times and appending times to the keys.
If one needs to condense s3 files into larger single files. This approach works amazingly well. Cheers!
import os
import boto3
import json
import tempfile
import re
from datetime import date, datetime,timezone
import _datetime
import time
def lambda_handler(event, context):
# TODO implement
s3_client = boto3.client("s3")
s3_resource = boto3.resource('s3')
paginator = s3_client.get_paginator('list_objects_v2')
now = int(round(time.time() * 1000))
min_now = now - 360000 # Go back 6 mins since lambda function runs every 5 mins
max_now = now + 60000 # This is to handle minute 59 after the hour.
keys = []
regex_keys = []
result = []
content_keys = []
my_bucket = s3_resource.Bucket('bucket')
prefix = 'Uploads/'
key_objects = iter(my_bucket.objects.filter(Prefix=prefix))
next(key_objects)
for object_summary in key_objects:
obj_key = object_summary.key # This gives me all the keys in the above prefix
keys.append(obj_key)
for key in keys:
regex_key = re.search('\/(.*?)\-', key).group(0).replace('/','').replace('-','') # I just want the timestamp (miliseconds)
regex_keys.append(regex_key)
for regex_key in regex_keys:
if min_now <= int(regex_key) <= max_now:
prefix = 'Uploads/' + str(regex_key)
pages = paginator.paginate(Bucket='bucket', Prefix=prefix)
for page in pages:
for obj in page['Contents']:
content_keys.append(obj['Key'])
print(len(content_keys))
return None

Related

Threading NNTP, how? (Newbie here)

I can't wrap my head around how I could possibly rewrite my code to be multi-threaded.
The code I'm writing is made to automatically archive every single article in a list of newsgroups that exist, but I wanna be able to utilize my newsgroup plan and make it up to 20 threads. I've never coded threading before and my attempts were in vein.
Here's my code, excluding the username and pass ( but you can get a free account with max 5 threads if you really want to at https://my.xsusenet.com )
Please don't judge me too hard :(
import nntplib
import sys
import datetime
import os
basetime = datetime.datetime.today()
#daysback = int(sys.argv[1])
#date_list = [basetime - datetime.timedelta(days=x) for x in range(daysback)]
s = nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD') # I am only allowed 5 connections at a time, so try for 4.
groups = []
resp, groups_list_tuple = s.list()
def remove_non_ascii_2(string):
return string.encode('ascii', errors='ignore').decode()
for g_tuple in groups_list_tuple:
#print(g_tuple) # DEBUG_LINE
# Parse group_list info
group = g_tuple[0]
last = g_tuple[1]
first = g_tuple[2]
flag = g_tuple[3]
# Parse newsgroup info
resp, count, first, last, name = s.group(group)
for message_id in range(first, last):
resp, number, mes_id = s.next()
resp, info = s.article(mes_id)
if os.path.exists('.\\' + group):
pass
else:
os.mkdir('.\\' + group)
print(f"Downloading: {message_id}")
outfile = open('.\\' + group + '\\' + str(message_id), 'a', encoding="utf-8")
for line in info.lines:
outfile.write(remove_non_ascii_2(str(line)) + '\n')
outfile.close()
Tried threading using a ThreadPoolExecutor, to cause it to use 20 threads, and failed, caused it to repeat the same process to the same message id. The expected result was to download 20 different messages at a time.
Here's the code I tried with threading, mind you I did like 6-8 variations of it to try and get it to work, this was the last one before I gave up to ask on here.
import nntplib
import sys
import datetime
import os
import concurrent.futures
basetime = datetime.datetime.today()
#daysback = int(sys.argv[1])
#date_list = [basetime - datetime.timedelta(days=x) for x in range(daysback)]
s = nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD') # I am only allowed 5 connections at a time, so try for 4.
groups = []
resp, groups_list_tuple = s.list()
def remove_non_ascii_2(string):
return string.encode('ascii', errors='ignore').decode()
def download_nntp_file(mess_id):
resp, count, first, last, name = s.group(group)
message_id = range(first, last)
resp, number, mes_id = s.next()
resp, info = s.article(mes_id)
if os.path.exists('.\\' + group):
pass
else:
os.mkdir('.\\' + group)
print(f"Downloading: {mess_id}")
outfile = open('.\\' + group + '\\' + str(mess_id), 'a', encoding="utf-8")
for line in info.lines:
outfile.write(remove_non_ascii_2(str(line)) + '\n')
outfile.close()
for g_tuple in groups_list_tuple:
#print(g_tuple) # DEBUG_LINE
# Parse group_list info
group = g_tuple[0]
last = g_tuple[1]
first = g_tuple[2]
flag = g_tuple[3]
# Parse newsgroup info
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
futures = executor.submit(download_nntp_file)

I can't test it with XSUseNet.
I wouldn't use global variables because when processes work at the same time then they may get the same values from these variables.
You should rather send values as parameters to functions.
Something like this:
def download_nntp_file(g_tuple):
# ... code which uses `g_tuple` instead of global variables ...
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
for g_tuple in groups_list_tuple:
executor.submit(download_nntp_file, g_tuple)
But I would be simpler to use map() instead of submit() because it gets list with arguments and it doesn't need for-loop
def download_nntp_file(g_tuple):
# ... code which uses `g_tuple` instead of global variables ...
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
executor.map(download_nntp_file, groups_list_tuple)

Speed up Boto3 file transfer across buckets

I want to copy a sub-subfolder in an S3 bucket into a different bucket using Python (boto3).
However, the process is painfully slow.
If I copy the folder "by hand" straight on S3 from the browser, the process takes 72 seconds (for a folder with around 140 objects, total size roughly 1.0 GB).
However, if I try to copy it with boto3, it takes 9 times longer (653 seconds).
This is the code that I am using, re-adapted from the boto3 documentation and various answers here in SO:
import boto3
s3 = boto3.resource('s3')
# define source bucket
src_bucket_name = 'bucket_1'
prefix = 'folder_1/'
client = boto3.client('s3')
src_bucket = s3.Bucket(src_bucket_name)
# define destination bucket
dest_bucket_name = 'bucket_2'
dest_bucket = s3.Bucket(dest_bucket_name)
folder = "folder_1/subfolder_1"
response_sub = client.list_objects_v2(Bucket=src_bucket_name, Prefix = folder)
# list files to be copied (select only images, but in this folder there are only images anyway)
files_src = [prefix['Key'] for prefix in response_sub['Contents'] if prefix['Key'].split('.')[-1].lower() in ['jpg','jpeg','png','tiff'] ]
# list of file names after copy
dest_prefix = 'folder_1/subfolder_1/'
files_dest = [dest_prefix+i for i in files_src]
for src,dest in zip(files_src,files_dest):
copy_source = {
'Bucket': src_bucket_name,
'Key': src
}
dest_bucket.copy(copy_source, dest)
Note that up to the last for loop, the code takes a couple of seconds only to run.
Any idea of how to speed up this? Am I doing something stupid/should use some other way of copying files/entire folders?

Thanks to #Suyog Shimpi (who pointed to a similar SO post), I was able to significantly speed up the copying process.
Here the code slightly readapted from the other post:
import os
import boto3
import botocore
import boto3.s3.transfer as s3transfer
import tqdm
s3 = boto3.resource('s3')
# define source bucket
src_bucket_name = 'bucket_1'
prefix = 'folder_1/'
client = boto3.client('s3')
src_bucket = s3.Bucket(src_bucket_name)
# define destination bucket
dest_bucket_name = 'bucket_2'
dest_bucket = s3.Bucket(dest_bucket_name)
folder = "folder_1/subfolder_1"
response_sub = client.list_objects_v2(Bucket=src_bucket_name, Prefix = folder)
# list files to be copied (select only images, but in this folder there are only images anyway)
files_src = [prefix['Key'] for prefix in response_sub['Contents'] if prefix['Key'].split('.')[-1].lower() in ['jpg','jpeg','png','tiff'] ]
# list of file names after copy
dest_prefix = 'folder_1/subfolder_1/'
files_dest = [dest_prefix+i for i in files_src]
botocore_config = botocore.config.Config(max_pool_connections=20)
s3client = boto3.client('s3', config=botocore_config)
transfer_config = s3transfer.TransferConfig(
use_threads=True,
max_concurrency=20,
)
# note that timing the process is optional
# total_size of the files can be obtained with boto3, or on the browser
%time
progress = tqdm.tqdm(
desc='upload',
total=total_size, unit='B', unit_scale=1,
position=0,
bar_format='{desc:<10}{percentage:3.0f}%|{bar:10}{r_bar}')
s3t = s3transfer.create_transfer_manager(s3client, transfer_config)
for src,dest in zip(files_src,files_dest):
copy_source = {
'Bucket': src_bucket_name,
'Key': src
}
s3t.copy(copy_source=copy_source,
bucket = dest_bucket_name,
key = dest,
subscribers=[s3transfer.ProgressCallbackInvoker(progress.update),],
)
# close transfer job
s3t.shutdown()
progress.close();

Thanks Fraccalo for your solution, it helped me a lot!
I adjusted it a little so that we can copy more than 1000 files:
import boto3
import botocore
import boto3.s3.transfer as s3transfer
import tqdm
s3 = boto3.resource('s3')
# define source bucket
src_bucket_name = 'bucket_1'
prefix = 'folder_1/'
client = boto3.client('s3')
src_bucket = s3.Bucket(src_bucket_name)
# define destination bucket
dest_bucket_name = 'bucket_2'
dest_bucket = s3.Bucket(dest_bucket_name)
folder = "folder_1/subfolder_1"
files_src = []
bucket_size = 0
# use paginator to read more than 1000 files
paginator = client.get_paginator('list_objects_v2')
operation_parameters = {'Bucket': src_bucket_name,
'Prefix': folder}
page_iterator = paginator.paginate(**operation_parameters)
for page in page_iterator:
if page.get('Contents', None):
files_src.extend([prefix['Key'] for prefix in page['Contents']])
bucket_size += sum(obj['Size'] for obj in page['Contents'])
# list of file names after copy
dest_prefix = 'folder_1/subfolder_1/'
files_dest = [dest_prefix+i for i in files_src]
botocore_config = botocore.config.Config(max_pool_connections=20)
s3client = boto3.client('s3', config=botocore_config)
transfer_config = s3transfer.TransferConfig(
use_threads=True,
max_concurrency=20,
)
progress = tqdm.tqdm(
desc='upload',
total=bucket_size, unit='B', unit_scale=1,
position=0,
bar_format='{desc:<10}{percentage:3.0f}%|{bar:10}{r_bar}')
s3t = s3transfer.create_transfer_manager(s3client, transfer_config)
for src,dest in zip(files_src,files_dest):
copy_source = {
'Bucket': src_bucket_name,
'Key': src
}
s3t.copy(copy_source=copy_source,
bucket = dest_bucket_name,
key = dest,
subscribers=[s3transfer.ProgressCallbackInvoker(progress.update),],
)
# close transfer job
s3t.shutdown()
progress.close();

How to extract nested json from api

I need help to get some json data from Pipedrive API(just a resquest.get), however, as expected, the json file is not formatted at all.
As you can see in my python script bellow, I'll upload that file to my S3 Bucket, so I can create tables in AWS Glue.
I'm having a lot of trouble to make this json file "readable" in AWS structure.
Sample of json file I'm working: [I need the "deals" section]
{"success":true,"data":[{"period_start":"2020-11-12 00:00:00","period_end":"2020-11-12 23:59:59","deals":[{"id":xx,"creator_user_id":xx,"user_id":XX,"person_id":XX,"org_id":XX,"stage_id":X}]}]}
Python script:
from datetime import datetime
import boto3
import requests
from jsonpath_ng import parse
import pandas as pd
now = datetime.now()
day_int = now.strftime("%d")
month_int = now.strftime("%m")
year_int = now.strftime("%Y")
yesterday = (int(day_int) - 1)
if yesterday == 0:
yesterday = 31
#today = now.strftime("%Y" + "-" + "%m" + "-" + str(yesterday))
today = '2020-11-12'
response_01 = requests.get(
'https://api.pipedrive.com/v1/deals/timeline?start_date=' + str(today) + '&interval=day&amount=' + str(
days) + '&field_key=update_time&totals_convert_currency=' + currency + '&api_token=' + token_api)
raw_data = response_01.json()
x = ([match.value for match in parse('$..deals.[*].[*]').find(raw_data)])
y = json.dumps(x, default= str)
s3 = boto3.resource(
's3',
region_name=data_region,
aws_access_key_id=key_id,
aws_secret_access_key=access_key
)
client = boto3.client('s3')
client.put_object(Body=str(raw_data), Bucket='bucket_key', Key='object_key') ```

Python ftplib and downloading filemasks

I've made a lot of progress on this, and can now download 3 of 4 files just fine, however, one, the Wisconsin file contains timestamps that I can't have removed, and vary day to day and I'm struggling to figure out how to get the wildcards to work on those values with regular expressions. I've posted my revised code below:
Examples of the file names are:
BCW_Daily SDP Yield.rpt2020-02-17***-09-02-32***.csv
hbc_platelet_daily_02102020.csv
MBC_ROLLING_YIELD_02172020.CSV
IBC_SDP_Rolling_7Days_021720.CSV
Any help is appreciated.
import datetime
import ftplib
import os
ftpdir =('/home/hospserv/inbound/platelet/')
savedir = "C:/FTP/"
archivedir = "C:/ftparchive/"
os.chdir(savedir)
today = datetime.date.today()
iltoday = datetime.date.today() - datetime.timedelta(days=7)
widate = (f"{today:%Y-%m-%d}")
ildate = (f"{iltoday:%m%d%Y}")
midate = (f"{today:%m%d%Y}")
indate = (f"{today:%m%d%y}")
filenameIN = ('IBC_SDP_Rolling_7Days_'+indate+'.CSV')
filenameWI = ('BCW_SDP_Rolling_7Days.rpt'***+widate+'*'+***'.csv')
filenameIL = ('hbc_platelet_daily_'+ildate+'.csv')
filenameMI = ('MBC_ROLLING_YIELD_'+midate+'.CSV')
dlfiles = [filenameMI,filenameIN,filenameWI,filenameIL]
connection = ftplib.FTP(host='xxx',user='xxx',passwd='xxx')
welcome = ftplib.FTP.getwelcome(connection)
print(welcome)
connection.cwd(ftpdir)
ftp_list = connection.nlst()
print(ftp_list)
for x in dlfiles:
if x in ftp_list:
connection.retrbinary("RETR "+x, open(os.path.join(savedir, x), 'wb').write)
else:
print(x+' fail')
connection.quit()

Solved it:
# import modules
import fnmatch
import datetime
import ftplib
import os
#define variables
ftpdir =('/home/hospserv/inbound/platelet/')
savedir = "C:/FTP/"
archivedir = "C:/ftparchive/"
filedir = "C:/DailyData/SDPS/"
os.chdir(savedir)
today = datetime.date.today()
iltoday = datetime.date.today() - datetime.timedelta(days=7)
widate = (f"{today:%Y-%m-%d}")
ildate = (f"{iltoday:%m%d%Y}")
midate = (f"{today:%m%d%Y}")
indate = (f"{today:%m%d%y}")
filenameIN = ('IBC_SDP_Rolling_7Days_'+indate+'.CSV')
pattern = ('BCW_SDP_Rolling_7Days.rpt'+widate+'*'+'.csv')
filenameIL = ('hbc_platelet_daily_'+ildate+'.csv')
filenameMI = ('MBC_ROLLING_YIELD_'+midate+'.CSV')
#create FTP connection
connection = ftplib.FTP(xxxxxxx)
connection.cwd(ftpdir)
#generate file list on FTP
ftp_list = connection.nlst()
#create wildcard string for WI file
wistring = fnmatch.filter(ftp_list,pattern)
filenameWI = str(wistring[0])
dlfiles = [filenameMI,filenameIN,filenameIL,filenameWI]
#download files from FTP to local
for x in dlfiles:
if x in ftp_list:
connection.retrbinary("RETR "+x, open(os.path.join(savedir, x), 'wb').write)
connection.quit()

boto set_contents_from_filename memory leak

I'm seeing a memory leak when using boto to upload files. Am I doing something wrong here? Memory usage seems to increase less consistently if I remove the sleep or if I don't alternate between two different buckets.
import time, resource, os
import boto
conn = boto.connect_s3()
for i in range(20):
print resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
path = 'test.png'
bucket = conn.lookup('jca-screenshots-' + ('thumbs' if i % 2 == 0 else 'normal'))
k = boto.s3.key.Key(bucket)
k.key = os.path.basename(path)
k.set_contents_from_filename(path)
time.sleep(5)
Sample output:
12406784
13123584
13242368
13344768
13398016
13422592
13484032
13524992
13553664
13590528
13656064
13664256

Solved by switching libs: https://github.com/tax/python-requests-aws
import time, resource, os
import requests
from awsauth import S3Auth
with open(os.path.expanduser("~/.boto")) as f:
lines = f.read().splitlines()
ACCESS_KEY = lines[1].split(' = ')[1]
SECRET_KEY = lines[2].split(' = ')[1]
for i in range(20):
print resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
url = 'http://{}.s3.amazonaws.com/{}'.format(
'jca-screenshots-' + ('thumbs' if i % 2 == 0 else 'normal'), 'test.png')
with open('test.png', 'rb') as f:
resp = requests.put(url, data=f, auth=S3Auth(ACCESS_KEY, SECRET_KEY))
print 'resp:', resp
time.sleep(5)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

AWS Lambda loop through files using boto3 - python

Related

Threading NNTP, how? (Newbie here)

Speed up Boto3 file transfer across buckets

How to extract nested json from api

Python ftplib and downloading filemasks

boto set_contents_from_filename memory leak

Categories

Resources