How to make use of multiprocessing in Python to make API calls simultaneously - Python

# Function to verify the user login and build the YouTube service
def get_authenticated_service(file, client_secret_file):
    credentials = None
    if os.path.exists(file):
        print('Loading Credentials From File...')
        with open(file, 'rb') as token:
            credentials = pickle.load(token)
        return build(API_SERVICE_NAME, API_VERSION, credentials=credentials)
    if not credentials or not credentials.valid:
        if credentials and credentials.expired and credentials.refresh_token:
            print('Refreshing Access Token...')
            credentials.refresh(Request())
        else:
            print('Fetching New Tokens...')
            flow = InstalledAppFlow.from_client_secrets_file(
                client_secret_file,
                scopes=[
                    'https://www.googleapis.com/auth/youtube.readonly',
                    'https://www.googleapis.com/auth/youtube.upload',
                    'https://www.googleapis.com/auth/youtube.force-ssl'
                ]
            )
            flow.run_local_server(port=8000, prompt='consent',
                                  authorization_prompt_message='')
            credentials = flow.credentials
            # Save the credentials for the next run
            with open(file, 'wb') as f:
                print('Saving Credentials for Future Use...')
                pickle.dump(credentials, f)
    return build(API_SERVICE_NAME, API_VERSION, credentials=credentials)
# Function to authenticate the user and build the request body for the upload
def initialize_upload(options, file, tokenFile, client_secret_file):
    youtube = get_authenticated_service(tokenFile, client_secret_file)
    tags = None
    if options.keywords:
        tags = options.keywords.split(',')
    body = dict(
        snippet=dict(
            title=options.title,
            description=options.description,
            tags=tags,
            categoryId=options.category
        ),
        status=dict(
            privacyStatus=options.privacyStatus
        )
    )
    # Call the API's videos.insert method to create and upload the video.
    print("running")
    insert_request = youtube.videos().insert(
        part=','.join(body.keys()),
        body=body,
        media_body=MediaFileUpload(file, chunksize=-1, resumable=True)
    )
    print("multiprocessing start")
    resumable_upload(insert_request)
    print("multiprocessing ends")
# Function for uploading the file in resumable chunks
def resumable_upload(request):
    response = None
    error = None
    retry = 0
    print("running by " + multiprocessing.current_process().name)
    while response is None:
        try:
            print('Uploading file...')
            status, response = request.next_chunk()
            print('Uploading file done...')
            if response is not None:
                if 'id' in response:
                    print('Video id "%s" was successfully uploaded.' % response['id'])
                else:
                    exit('The upload failed with an unexpected response: %s' % response)
        except HttpError as e:
            if e.resp.status in RETRIABLE_STATUS_CODES:
                error = 'A retriable HTTP error %d occurred:\n%s' % (e.resp.status, e.content)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as e:
            error = 'A retriable error occurred: %s' % e
        if error is not None:
            print(error)
            retry += 1
            if retry > MAX_RETRIES:
                exit('No longer attempting to retry.')
            max_sleep = 2 ** retry
            # sleep_seconds = random.random() * max_sleep
            sleep_seconds = 0.5
            print('Sleeping %f seconds and then retrying...' % sleep_seconds)
            time.sleep(sleep_seconds)
# The title, description, etc. are passed as arguments and the uploads are launched as separate processes
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--title', help='Video title', default='Android react native code sample(ANdroid emulator)')
    parser.add_argument('--description', help='Android react native code sample description',
                        default='Android react native code sample description(ANdroid emulator)')
    parser.add_argument('--category', default='22',
                        help='Numeric video category. ' +
                             'See https://developers.google.com/youtube/v3/docs/videoCategories/list')
    parser.add_argument('--keywords', help='react native, react, android, emulator',
                        default='react native, react')
    parser.add_argument('--privacyStatus', choices=VALID_PRIVACY_STATUSES,
                        default='private', help='Video privacy status.')
    args = parser.parse_args()
    try:
        start = time.perf_counter()
        t1 = multiprocessing.Process(target=initialize_upload, args=(args, "android1.mp4", 'token1.pickle', 'client_secret.json'))
        t2 = multiprocessing.Process(target=initialize_upload, args=(args, "android1.mp4", 'token2.pickle', 'client_secret.json'))
        t1.start()
        t2.start()
        t1.join()
        t2.join()
        finish = time.perf_counter()
        print(f'Finished in {round(finish-start, 2)} seconds..')
    except HttpError as e:
        print('An HTTP error %d occurred:\n%s' % (e.resp.status, e.content))
If I run this, it takes 12.09 seconds, and without multiprocessing (i.e. calling the uploads one after another) it takes 12.39 seconds. After debugging a little, I found that everything up to print('Uploading file...') (in resumable_upload()) runs as expected, but after that point the uploads appear to run one at a time. Can anyone suggest how I can make this work with multiprocessing? Below is the output of this file.
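For comparison, a minimal sketch of driving the same two uploads through a multiprocessing.Pool (using the same hypothetical file and token names as above) would look roughly like this; it should behave the same as the explicit Process objects:

import multiprocessing

def launch_uploads(args):
    # One tuple per upload job: (parsed args, video file, token file, client secret).
    jobs = [
        (args, "android1.mp4", 'token1.pickle', 'client_secret.json'),
        (args, "android1.mp4", 'token2.pickle', 'client_secret.json'),
    ]
    # starmap fans the jobs out to the worker processes and blocks until
    # every initialize_upload call has returned.
    with multiprocessing.Pool(processes=len(jobs)) as pool:
        pool.starmap(initialize_upload, jobs)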

Related

Google OAuth + Django - can't authenticate from saved data

Okay, so I got the flow working, but it only works in the current session; if I try to load the data back in, even on a simple refresh, the credentials don't hold up.
This works; it is the view OAuth redirects to after the user accepts the prompts:
def oauth_redir(request):
    u = Employee.objects.filter(dashboard_user=request.user)
    if not u:
        u = Employee.objects.create(dashboard_user=request.user)
    else:
        u = u[0]
    flow = google_auth_oauthlib.flow.Flow.from_client_secrets_file(
        CLIENT_SECRETS_FILE, scopes=SCOPES, state=request.GET.get("state"))
    flow.redirect_uri = REDIRECT_URL
    flow.fetch_token(authorization_response=request.build_absolute_uri().replace("http:", "https:"))
    # saving credentials for future use
    credentials = flow.credentials
    if not Emp_google_creds.objects.filter(employee=u):
        Emp_google_creds.objects.create(
            token=credentials.token,
            refresh_token=credentials.refresh_token,
            token_uri=credentials.token_uri,
            client_id=credentials.client_id,
            client_secret=credentials.client_secret,
            scopes=" ".join(credentials.scopes),
            employee=u
        )
    else:
        creds = Emp_google_creds.objects.get(employee=u)
        creds.token = credentials.token,
        creds.refresh_token = credentials.refresh_token,
        creds.token_uri = credentials.token_uri,
        creds.client_id = credentials.client_id,
        creds.client_secret = credentials.client_secret,
        creds.scopes = " ".join(credentials.scopes),
        creds.save()
    if not credentials or not credentials.valid:
        print(credentials, credentials.valid, credentials.expiry)
        # credentials.refresh(Request())
        # return redirect("/calendar/cal_auth/")
    try:
        service = googleapiclient.discovery.build('calendar', 'v3', credentials=credentials)
        # Call the Calendar API
        now = datetime.datetime.utcnow().isoformat() + 'Z'  # 'Z' indicates UTC time
        print('Getting the upcoming 10 events')
        events_result = service.events().list(calendarId='primary', timeMin=now,
                                              maxResults=10, singleEvents=True,
                                              orderBy='startTime').execute()
        events = events_result.get('items', [])
        if not events:
            print('No upcoming events found.')
            return
        # Prints the start and name of the next 10 events
        for event in events:
            start = event['start'].get('dateTime', event['start'].get('date'))
            print(start, event['summary'])
    except HttpError as error:
        print('An error occurred: %s' % error)
    print("end of redir func")
    return render(request, "schedule.html")
This loads the schedule as expected. However, if you refresh in order to use the stored credentials, they never work and they never refresh; I'm unsure what to do.
def calendar_home(request):
    u = Employee.objects.filter(dashboard_user=request.user)
    if not u:
        u = Employee.objects.create(dashboard_user=request.user)
    else:
        u = u[0]
    print(Emp_google_creds.objects.filter(employee=u))
    if not Emp_google_creds.objects.filter(employee=u):
        return redirect("/calendar/cal_auth/")
    else:
        creds = Emp_google_creds.objects.filter(employee=u)[0]
        gcreds = google.oauth2.credentials.Credentials.from_authorized_user_info(creds.to_dict())
        if not gcreds or not gcreds.valid:
            print(gcreds, gcreds.valid, gcreds.expiry)
            # gcreds.refresh(Request())
            return redirect("/calendar/cal_auth/")
    try:
        service = googleapiclient.discovery.build('calendar', 'v3', credentials=gcreds)
        # Call the Calendar API
        now = datetime.datetime.utcnow().isoformat() + 'Z'  # 'Z' indicates UTC time
        print('Getting the upcoming 10 events')
        events_result = service.events().list(calendarId='primary', timeMin=now,
                                              maxResults=10, singleEvents=True,
                                              orderBy='startTime').execute()
        events = events_result.get('items', [])
        if not events:
            print('No upcoming events found.')
            return
        # Prints the start and name of the next 10 events
        for event in events:
            start = event['start'].get('dateTime', event['start'].get('date'))
            print(start, event['summary'])
    except HttpError as error:
        print('An error occurred: %s' % error)
    return render(request, "schedule.html")
Basically, this:
print(gcreds, gcreds.valid, gcreds.expiry) =>
<google.oauth2.credentials.Credentials object at 0x7f322a00b340> False 2022-06-03 14:44:02.232624
and if I comment the redirect out I get:
('invalid_client: The OAuth client was not found.', {'error': 'invalid_client', 'error_description': 'The OAuth client was not found.'})
caused by this line:
events_result = service.events().list(calendarId='primary', timeMin=now, ...
I just went through all of this last week. Maybe try building the credentials manually, like I did, instead of using Credentials.from_authorized_user_info(). I found out the hard way that Google likes to change things without updating their documentation.
user_creds = json.loads(Emp_google_creds.objects.filter(employee=u)[0])
creds = Credentials(
    token=user_creds['token'],
    refresh_token=user_creds['refresh_token'],
    token_uri=user_creds['token_uri'],
    client_id=user_creds['client_id'],
    client_secret=user_creds['client_secret']
)
service = build('calendar', 'v3', credentials=creds)
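If the stored token has already expired, a possible follow-up (a sketch only, assuming the same creds object as above and the google.auth.transport.requests Request class) is to refresh it explicitly before building the service, so the stored refresh_token is exchanged for a fresh access token:

from google.auth.transport.requests import Request

# Sketch: exchange the stored refresh_token for a new access token
# before building the Calendar service.
if creds.refresh_token:
    creds.refresh(Request())
service = build('calendar', 'v3', credentials=creds)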

Python SIEM log collector hangs randomly

I am trying to troubleshoot a script for Mimecast's API. The script runs fine for the most part, but a few times, I have noticed that it stops pulling logs and generally appears to be a hung process. After restarting the script and manually pushing logs to the syslog server, it starts working again without issue. I am not able to reproduce this issue at will.
The script is supposed to do the following:
Authenticate against Mimecast's API
Sign responses
Download, extract and save log files to the log directory
Utilize a tokenized header to determine which file was downloaded in the last request; the token ID should be saved within a file in the checkpoint directory
Push files to remote syslog server
Output any errors and info to console
Below is the sample code from Mimecast.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging.handlers
import json
import os
import requests
import base64
import uuid
import datetime
import hashlib
import shutil
import hmac
import time
from zipfile import ZipFile
import io
# Set up variables
APP_ID = "YOUR DEVELOPER APPLICATION ID"
APP_KEY = "YOUR DEVELOPER APPLICATION KEY"
URI = "/api/audit/get-siem-logs"
EMAIL_ADDRESS = 'EMAIL ADDRESS OF YOUR ADMINISTRATOR'
ACCESS_KEY = 'ACCESS KEY FOR YOUR ADMINISTRATOR'
SECRET_KEY = 'SECRET KEY FOR YOUR ADMINISTRATOR'
LOG_FILE_PATH = "FULLY QUALIFIED PATH TO FOLDER TO WRITE LOGS"
CHK_POINT_DIR = 'FULLY QUALIFIED PATH TO FOLDER TO WRITE PAGE TOKEN'
# Set True to output to syslog, false to only save to file
syslog_output = False
# Enter the IP address or hostname of your syslog server
syslog_server = 'localhost'
# Change this to override default port
syslog_port = 514
# delete files after fetching
delete_files = True
# Set threshold in number of files in log file directory
log_file_threshold = 10000
# Set up logging (in this case to terminal)
log = logging.getLogger(__name__)
log.root.setLevel(logging.DEBUG)
log_formatter = logging.Formatter('%(levelname)s %(message)s')
log_handler = logging.StreamHandler()
log_handler.setFormatter(log_formatter)
log.addHandler(log_handler)
# Set up syslog output
syslog_handler = logging.handlers.SysLogHandler(address=(syslog_server, syslog_port))
syslog_formatter = logging.Formatter('%(message)s')
syslog_handler.setFormatter(syslog_formatter)
syslogger = logging.getLogger(__name__)
syslogger = logging.getLogger('SysLogger')
syslogger.addHandler(syslog_handler)
# Supporting methods
def get_hdr_date():
    return datetime.datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S UTC")
def read_file(file_name):
    try:
        with open(file_name, 'r') as f:
            data = f.read()
        return data
    except Exception as e:
        log.error('Error reading file ' + file_name + '. Cannot continue. Exception: ' + str(e))
        quit()
def write_file(file_name, data_to_write):
    if '.zip' in file_name:
        try:
            byte_content = io.BytesIO(data_to_write)
            zip_file = ZipFile(byte_content)
            zip_file.extractall(LOG_FILE_PATH)
        except Exception as e:
            log.error('Error writing file ' + file_name + '. Cannot continue. Exception: ' + str(e))
            quit()
    else:
        try:
            with open(file_name, 'w') as f:
                f.write(data_to_write)
        except Exception as e:
            log.error('Error writing file ' + file_name + '. Cannot continue. Exception: ' + str(e))
            quit()
def get_base_url(email_address):
    # Create post body for request
    post_body = dict()
    post_body['data'] = [{}]
    post_body['data'][0]['emailAddress'] = email_address
    # Create variables required for request headers
    request_id = str(uuid.uuid4())
    request_date = get_hdr_date()
    headers = {'x-mc-app-id': APP_ID, 'x-mc-req-id': request_id, 'x-mc-date': request_date}
    # Send request to API
    log.debug('Sending request to https://api.mimecast.com/api/discover-authentication with request Id: ' +
              request_id)
    try:
        r = requests.post(url='https://api.mimecast.com/api/login/discover-authentication',
                          data=json.dumps(post_body), headers=headers)
        # Handle Rate Limiting
        if r.status_code == 429:
            log.warning('Rate limit hit. sleeping for ' + str(r.headers['X-RateLimit-Reset'] * 1000))
            time.sleep(r.headers['X-RateLimit-Reset'] * 1000)
    except Exception as e:
        log.error('Unexpected error getting base url. Cannot continue.' + str(e))
        quit()
    # Handle error from API
    if r.status_code != 200:
        log.error('Request returned with status code: ' + str(r.status_code) + ', response body: ' +
                  r.text + '. Cannot continue.')
        quit()
    # Load response body as JSON
    resp_data = json.loads(r.text)
    # Look for the api key in the region object to get the base url
    if 'region' in resp_data["data"][0]:
        base_url = resp_data["data"][0]["region"]["api"].split('//')
        base_url = base_url[1]
    else:
        # Handle no region found, likely the email address was entered incorrectly
        log.error(
            'No region information returned from API, please check the email address. '
            'Cannot continue')
        quit()
    return base_url
def post_request(base_url, uri, post_body, access_key, secret_key):
    # Create variables required for request headers
    request_id = str(uuid.uuid4())
    request_date = get_hdr_date()
    unsigned_auth_header = '{date}:{req_id}:{uri}:{app_key}'.format(
        date=request_date,
        req_id=request_id,
        uri=uri,
        app_key=APP_KEY
    )
    hmac_sha1 = hmac.new(
        base64.b64decode(secret_key),
        unsigned_auth_header.encode(),
        digestmod=hashlib.sha1).digest()
    sig = base64.encodebytes(hmac_sha1).rstrip()
    headers = {
        'Authorization': 'MC ' + access_key + ':' + sig.decode(),
        'x-mc-app-id': APP_ID,
        'x-mc-date': request_date,
        'x-mc-req-id': request_id,
        'Content-Type': 'application/json'
    }
    try:
        # Send request to API
        log.debug('Sending request to https://' + base_url + uri + ' with request Id: ' + request_id)
        r = requests.post(url='https://' + base_url + uri, data=json.dumps(post_body), headers=headers)
        # Handle Rate Limiting
        if r.status_code == 429:
            log.warning('Rate limit hit. sleeping for ' + str(r.headers['X-RateLimit-Reset'] * 1000))
            time.sleep(r.headers['X-RateLimit-Reset'] * 1000)
            r = requests.post(url='https://' + base_url + uri, data=json.dumps(post_body), headers=headers)
    # Handle errors
    except Exception as e:
        log.error('Unexpected error connecting to API. Exception: ' + str(e))
        return 'error'
    # Handle errors from API
    if r.status_code != 200:
        log.error('Request to ' + uri + ' with , request id: ' + request_id + ' returned with status code: ' +
                  str(r.status_code) + ', response body: ' + r.text)
        return 'error'
    # Return response body and response headers
    return r.content, r.headers
def get_mta_siem_logs(checkpoint_dir, base_url, access_key, secret_key):
    uri = "/api/audit/get-siem-logs"
    # Set checkpoint file name to store page token
    checkpoint_filename = os.path.join(checkpoint_dir, 'get_mta_siem_logs_checkpoint')
    # Build post body for request
    post_body = dict()
    post_body['data'] = [{}]
    post_body['data'][0]['type'] = 'MTA'
    post_body['data'][0]['compress'] = True
    if os.path.exists(checkpoint_filename):
        post_body['data'][0]['token'] = read_file(checkpoint_filename)
    # Send request to API
    resp = post_request(base_url, uri, post_body, access_key, secret_key)
    now = datetime.datetime.now().strftime("%a %b %d %H:%M:%S %Y")
    # Process response
    if resp != 'error':
        resp_body = resp[0]
        resp_headers = resp[1]
        content_type = resp_headers['Content-Type']
        # End if response is JSON as there is no log file to download
        if content_type == 'application/json':
            log.info('No more logs available')
            return False
        # Process log file
        elif content_type == 'application/octet-stream':
            file_name = resp_headers['Content-Disposition'].split('=\"')
            file_name = file_name[1][:-1]
            # Save files to LOG_FILE_PATH
            write_file(os.path.join(LOG_FILE_PATH, file_name), resp_body)
            # Save mc-siem-token page token to check point directory
            write_file(checkpoint_filename, resp_headers['mc-siem-token'])
            try:
                if syslog_output is True:
                    for filename in os.listdir(LOG_FILE_PATH):
                        file_creation_time = time.ctime(os.path.getctime(LOG_FILE_PATH + "/" + filename))
                        if now < file_creation_time or now == file_creation_time:
                            log.info('Loading file: ' + filename + ' to output to ' + syslog_server + ':' + str(syslog_port))
                            with open(file=os.path.join(LOG_FILE_PATH, filename), mode='r', encoding='utf-8') as log_file:
                                lines = log_file.read().splitlines()
                                for line in lines:
                                    syslogger.info(line)
                            log.info('Syslog output completed for file ' + filename)
            except Exception as e:
                log.error('Unexpected error writing to syslog. Exception: ' + str(e))
            # return true to continue loop
            return True
    else:
        # Handle errors
        log.error('Unexpected response')
        for header in resp_headers:
            log.error(header)
        return False
def run_script():
    # discover base URL
    try:
        base_url = get_base_url(email_address=EMAIL_ADDRESS)
    except Exception as e:
        log.error('Error discovering base url for ' + EMAIL_ADDRESS + ' . Exception: ' + str(e))
        quit()
    # Request log data in a loop until there are no more logs to collect
    try:
        log.info('Getting MTA log data')
        while get_mta_siem_logs(checkpoint_dir=CHK_POINT_DIR, base_url=base_url, access_key=ACCESS_KEY,
                                secret_key=SECRET_KEY) is True:
            log.info('Getting more MTA log files')
    except Exception as e:
        log.error('Unexpected error getting MTA logs ' + (str(e)))
    file_number = len([name for name in os.listdir(LOG_FILE_PATH) if os.path.isfile(name)])
    if delete_files or file_number >= log_file_threshold:
        for filename in os.listdir(LOG_FILE_PATH):
            file_path = os.path.join(LOG_FILE_PATH, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (file_path, e))
    quit()

# Run script
run_script()
It seems like it may potentially be a race condition, but I am not sure how to confirm that since I can't reproduce it. I notice that SumoLogic also has a modified version of this script, with a different methodology for managing the files/paths. If that script works better than the main sample script above, would anybody be able to explain why? I haven't had any issues with it yet.
https://github.com/SumoLogic/sumologic-content/blob/master/MimeCast/SumoLogic-Mimecast-Data-Collection/siem_collection.py
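As a general diagnostic sketch (not a confirmed explanation of this script's behaviour): on Unix-like systems you can register a faulthandler signal near the top of the collector, so that sending the hung process a signal dumps every thread's Python stack and shows exactly which call is blocking:

import faulthandler
import signal

# Sketch: after this, "kill -USR1 <pid>" prints the current Python stack of
# every thread to stderr while the process keeps running.
faulthandler.register(signal.SIGUSR1)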

How to mock functionality of boto3 module using pytest

I have a custom module written called sqs.py. The script will do the following:
Get a message from AWS SQS
Get the AWS S3 path to delete
Delete the path
Send a confirmation email to the user
I'm trying to write unit tests for this module that will verify the code will execute as expected and that it will raise exceptions when they do occur.
This means I will need to mock the response from Boto3 calls that I make. My problem is that the code will first establish the SQS client to obtain the message and then a second call to establish the S3 client. I'm not sure how to mock these 2 independent calls and be able to fake a response so I can test my script's functionality. Perhaps my approach is incorrect. At any case, any advice on how to do this properly is appreciated.
Here's what the code looks like:
import boto3
import json
import os
import pprint
import time
import asyncio
import logging
from send_email import send_email

queue_url = 'https://xxxx.queue.amazonaws.com/1234567890/queue'

def shutdown(message):
    """ Sends shutdown command to OS """
    os.system(f'shutdown +5 "{message}"')

def send_failure_email(email_config: dict, error_message: str):
    """ Sends email notification to user with error message attached. """
    recipient_name = email_config['recipient_name']
    email_config['subject'] = 'Subject: Restore Failed'
    email_config['message'] = f'Hello {recipient_name},\n\n' \
        + 'We regret that an error has occurred during the restore process. ' \
        + 'Please try again in a few minutes.\n\n' \
        + f'Error: {error_message}.\n\n'
    try:
        send_email(email_config)
    except RuntimeError as error_message:
        logging.error(f'ERROR: cannot send email to user. {error_message}')
async def restore_s3_objects(s3_client: object, p_bucket_name: str, p_prefix: str):
    """Attempts to restore objects specified by p_bucket_name and p_prefix.
    Returns True if restore took place, false otherwise.
    """
    is_truncated = True
    key_marker = None
    key = ''
    number_of_items_restored = 0
    has_restore_occured = False
    logging.info(f'performing restore for {p_bucket_name}/{p_prefix}')
    try:
        while is_truncated == True:
            if not key_marker:
                version_list = s3_client.list_object_versions(
                    Bucket=p_bucket_name,
                    Prefix=p_prefix)
            else:
                version_list = s3_client.list_object_versions(
                    Bucket=p_bucket_name,
                    Prefix=p_prefix,
                    KeyMarker=key_marker)
            if 'DeleteMarkers' in version_list:
                logging.info('found delete markers')
                delete_markers = version_list['DeleteMarkers']
                for d in delete_markers:
                    if d['IsLatest'] == True:
                        key = d['Key']
                        version_id = d['VersionId']
                        s3_client.delete_object(
                            Bucket=p_bucket_name,
                            Key=key,
                            VersionId=version_id
                        )
                        number_of_items_restored = number_of_items_restored + 1
            is_truncated = version_list['IsTruncated']
            logging.info(f'is_truncated: {is_truncated}')
            if 'NextKeyMarker' in version_list:
                key_marker = version_list['NextKeyMarker']
        if number_of_items_restored > 0:
            has_restore_occured = True
        return has_restore_occured
    except Exception as error_message:
        raise RuntimeError(error_message)
async def main():
    if 'AWS_ACCESS_KEY_ID' in os.environ \
            and 'AWS_SECRET_ACCESS_KEY' in os.environ \
            and os.environ['AWS_ACCESS_KEY_ID'] != '' \
            and os.environ['AWS_SECRET_ACCESS_KEY'] != '':
        sqs_client = boto3.client(
            'sqs',
            aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
            verify=False
        )
        s3_client = boto3.client(
            's3',
            aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
            verify=False
        )
    else:
        sqs_client = boto3.client(
            'sqs',
            verify=False,
        )
        s3_client = boto3.client(
            's3',
            verify=False,
        )
    received_message = sqs_client.receive_message(
        QueueUrl=queue_url,
        AttributeNames=['All'],
        VisibilityTimeout=10,
        WaitTimeSeconds=20,  # Wait up to 20 seconds for a message to arrive
    )
    if 'Messages' in received_message \
            and len(received_message['Messages']) > 0:
        # NOTE: Initialize email configuration
        receipient_email = 'support#example.com'
        username = receipient_email.split('#')[0]
        fullname_length = len(username.split('.'))
        fullname = f"{username.split('.')[0]}"  # Group name / First name only
        if (fullname_length == 2):  # First name and last name available
            fullname = f"{username.split('.')[0]} {username.split('.')[1]}"
        fullname = fullname.title()
        email_config = {
            'destination': receipient_email,
            'recipient_name': fullname,
            'subject': 'Subject: Restore Complete',
            'message': ''
        }
        try:
            receipt_handle = received_message['Messages'][0]['ReceiptHandle']
        except Exception as error_message:
            logging.error(error_message)
            send_failure_email(email_config, error_message)
            shutdown(f'{error_message}')
        try:
            data = received_message['Messages'][0]['Body']
            data = json.loads(data)
            logging.info('A SQS message for a restore has been received.')
        except Exception as error_message:
            message = f'Unable to obtain and parse message body. {error_message}'
            logging.error(message)
            send_failure_email(email_config, message)
            shutdown(f'{error_message}')
        try:
            bucket = data['bucket']
            prefix = data['prefix']
        except Exception as error_message:
            message = f'Retrieving bucket name and prefix failed. {error_message}'
            logging.error(message)
            send_failure_email(email_config, message)
            shutdown(f'{error_message}')
        try:
            logging.info(f'Initiating restore for path: {bucket}/{prefix}')
            restore_was_performed = await asyncio.create_task(restore_s3_objects(s3_client, bucket, prefix))
            if restore_was_performed is True:
                email_config['message'] = f'Hello {fullname},\n\n' \
                    + f'The files in the path \'{bucket}/{prefix}\' have been restored. '
                send_email(email_config)
                logging.info('Restore complete. Shutting down.')
            else:
                logging.info('Path does not require restore. Shutting down.')
            shutdown(f'shutdown +5 "Restore successful! System will shutdown in 5 mins"')
        except Exception as error_message:
            message = f'File restoration failed. {error_message}'
            logging.error(message)
            send_failure_email(email_config, message)
            shutdown(f'{error_message}')
        try:
            sqs_client.delete_message(
                QueueUrl=queue_url,
                ReceiptHandle=receipt_handle,
            )
        except Exception as error_message:
            message = f'Deleting restore session from SQS failed. {error_message}'
            logging.error(message)
            send_failure_email(email_config, message)
            shutdown(f'{error_message}')

if __name__ == '__main__':
    logging.basicConfig(filename='restore.log', level=logging.INFO)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    loop.close()
The only way I was able to mock Boto3 is by rebuilding a small class that represents the actual method structure. This is because Boto3 uses dynamic methods, and all the resource-level methods are created at runtime.
This might not be industry standard, but I wasn't able to get any of the methods I found on the internet to work most of the time, and this worked pretty well for me and requires minimal effort (compared to some of the solutions I found).
import os
import unittest
from unittest import mock

import boto3
# restore_s3_objects is imported from the module under test

class MockClient:
    def __init__(self, region_name, aws_access_key_id, aws_secret_access_key):
        self.region_name = region_name
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.MockS3 = MockS3()

    def client(self, service_name, **kwargs):
        return self.MockS3

class MockS3:
    def __init__(self):
        self.response = None  # Set your mock S3 data here

    def list_object_versions(self, **kwargs):
        return self.response

class S3TestCase(unittest.TestCase):
    def test_restore_s3_objects(self):
        # Given
        bucket = "testBucket"   # Set this to something somewhat realistic
        prefix = "some/prefix"  # Set this to something somewhat realistic
        env_vars = mock.patch.dict(os.environ, {"AWS_ACCESS_KEY_ID": "abc",
                                                "AWS_SECRET_ACCESS_KEY": "def"})
        env_vars.start()
        # Initialising the Session can be tricky since it has to be patched on
        # the module/file that creates the session in the actual code rather than
        # where the Session class lives. In this case you might have to import from
        # main rather than boto3.
        boto3.session.Session = mock.Mock(side_effect=[
            MockClient(region_name='eu-west-1',
                       aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                       aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'])])
        s3_client = boto3.client('s3', verify=False)
        # When
        has_restore_occured = restore_s3_objects(s3_client, bucket, prefix)
        # Then
        self.assertEqual(has_restore_occured, False)  # your expected result set
        env_vars.stop()
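An alternative sketch using only the standard library, offered with the caveat that it is not a drop-in test for the code above: patch boto3.client where the module under test creates it, and hand back two pre-configured MagicMock objects via side_effect, one per client, in the order the calls are made ('my_module' is a placeholder name):

from unittest import mock

# Two independent fakes: the first boto3.client(...) call gets the SQS fake,
# the second gets the S3 fake.
fake_sqs = mock.MagicMock()
fake_sqs.receive_message.return_value = {'Messages': []}            # canned SQS reply
fake_s3 = mock.MagicMock()
fake_s3.list_object_versions.return_value = {'IsTruncated': False}  # canned S3 reply

with mock.patch('my_module.boto3.client', side_effect=[fake_sqs, fake_s3]):
    ...  # run the code under test here and assert against fake_sqs / fake_s3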

Pythonic way of using next-page token

A lot of APIs now return page tokens, and I was curious what the most appropriate way of dealing with them is.
I have usually just gone with recursion:
def get_followers(user_id,
                  access_token,
                  cursor=''):
    url = 'https://api.instagram.com/v1/users/%s/followed-by?access_token=%s&cursor=%s' % (my_user,
                                                                                            access_token,
                                                                                            cursor)
    print(url)
    response = requests.get(url)
    json_out = json.loads(response.text)
    for user in json_out['data']:
        usrname = user['username']
        my_followers.add_follower(usrname)
        print("Added: %s" % usrname)
    try:
        cursor = json_out['pagination']['next_url']
        get_followers(user_id, access_token, cursor)
    except KeyError:
        print("Finished with %d followers" % len(my_followers.followers))
        return
However, perhaps a:
while True:
    ...
    if condition:
        break
or some other implementation is seen as more efficient/Pythonic?
You could change this to a generator function that yields the next follower when the structure is accessed, fetching more data only when necessary.
def get_followers(...):
    ...
    while token is not None:
        # fetch data
        ...
        for user in json_out['data']:
            yield user
        ...
        # fetch new token
Then, iterate through this generator to apply your data handling. This also has the advantage of separating data acquisition and handling logic.
followers = get_followers(...)
for user in followers:
    username = user['username']
    my_followers.add_follower(username)
    print("Added: %s" % username)
Thanks for the help; I have never used generators apart from the implicit (range vs xrange) distinction. I was curious whether, in this case, it provides any speed/memory advantage?
My final code-snippet:
def get_followers(user_id,
                  access_token,
                  cursor=''):
    """ Create a generator which will be accessed later to get the usernames of the followers """
    while cursor is not None:
        # fetch data
        url = 'https://api.instagram.com/v1/users/%s/followed-by?access_token=%s&cursor=%s' % (my_user,
                                                                                                access_token,
                                                                                                cursor)
        print(url)
        response = requests.get(url)
        json_out = json.loads(response.text)
        for user in json_out['data']:
            yield user
        # fetch new token
        try:
            cursor = json_out['pagination']['next_url']
        except KeyError:
            print("Finished with %d followers" % len(my_followers.followers))
            cursor = None
followers_gen = get_followers(my_user, access_token)  # Get followers
for usr in followers_gen:
    usrname = usr['username']
    my_followers.add_follower(usrname)
    print("Added: %s" % usrname)
Compared to not using a generator:
def get_followers(user_id,
                  access_token,
                  cursor=''):
    """ Create a generator which will be accessed later to get the usernames of the followers"""
    while cursor is not None:
        # fetch data
        url = 'https://api.instagram.com/v1/users/%s/followed-by?access_token=%s&cursor=%s' % (my_user,
                                                                                                access_token,
                                                                                                cursor)
        print(url)
        response = requests.get(url)
        json_out = json.loads(response.text)
        for usr in json_out['data']:
            usrname = usr['username']
            my_followers.add_follower(usrname)
            print("Added: %s" % usrname)
        # fetch new token
        try:
            cursor = json_out['pagination']['next_url']
            get_followers(user_id, access_token, cursor)
        except KeyError:
            print("Finished with %d followers" % len(my_followers.followers))
            cursor = None

get_followers(my_user, access_token)  # Get followers

Python script to harvest tweets to a MongoDB works with users but not hashtags. Any ideas why not?

I'm playing around with the Twitter API and am in the process of developing a script to pull all Tweets with a certain hashtag down to a local MongoDB. It works fine when I'm downloading tweets from users, but when downloading tweets for a hashtag I get:
return loads(fp.read(),
AttributeError: 'int' object has no attribute 'read'
Can anyone offer their infinite wisdom on how I could get this script to work?
To run, save it as a .py file, cd to the folder and run:
python twitter.py
Code:
__author__ = 'Tom Cusack'
import pymongo
import oauth2 as oauth
import urllib2, json
import sys, argparse, time

def oauth_header(url, consumer, token):
    params = {'oauth_version': '1.0',
              'oauth_nonce': oauth.generate_nonce(),
              'oauth_timestamp': int(time.time()),
              }
    req = oauth.Request(method='GET', url=url, parameters=params)
    req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, token)
    return req.to_header()['Authorization'].encode('utf-8')

def main():
    ### Twitter Settings
    numtweets = '32000'
    verbose = 'store_true'
    retweet = 'store_false'
    CONSUMER_KEY = 'M7Xu9Wte0eIZvqhb4G9HnIn3G'
    CONSUMER_SECRET = 'c8hB4Qwps2aODQUx7UsyzQuCRifEp3PKu6hPQll8wnJGIhbKgZ'
    ACCESS_TOKEN = '3213221313-APuXuNjVMbRbZpu6sVbETbgqkponGsZJVT53QmG'
    ACCESS_SECRET = 'BJHrqWC9ed3pA5oDstSMCYcUcz2pYF3DmJ7jcuDe7yxvi'
    base_url = url = 'https://api.twitter.com/1.1/search/tweets.json?include_entities=true&count=200&q=#mongodb&include_rts=%s' % (retweet)
    oauth_consumer = oauth.Consumer(key=CONSUMER_KEY, secret=CONSUMER_SECRET)
    oauth_token = oauth.Token(key=ACCESS_TOKEN, secret=ACCESS_SECRET)
    ### Mongodb Settings
    uri = 'mongodb://127.0.0.1:27017/SARKY'
    if uri != None:
        try:
            conn = pymongo.MongoClient(uri)
            print 'Pulling Tweets..'
        except:
            print 'Error: Unable to connect to DB. Check uri variable.'
            return
        uri_parts = pymongo.uri_parser.parse_uri(uri)
        db = conn[uri_parts['database']]
        db['twitter-harvest'].ensure_index('id_str')
    ### Helper Variables for Harvest
    max_id = -1
    tweet_count = 0
    stream = 0
    ### Begin Harvesting
    while True:
        auth = oauth_header(url, oauth_consumer, oauth_token)
        headers = {"Authorization": auth}
        request = urllib2.Request(url, headers=headers)
        try:
            stream = urllib2.urlopen(request)
        except urllib2.HTTPError, err:
            if err.code == 404:
                print 'Error: Unknown user. Check --user arg'
                return
            if err.code == 401:
                print 'Error: Unauthorized. Check Twitter credentials'
                return
        tweet_list = json.load(stream)
        if len(tweet_list) == 0:
            print 'No tweets to harvest!'
            return
        if 'errors' in tweet_list:
            print 'Hit rate limit, code: %s, message: %s' % (tweets['errors']['code'], tweets['errors']['message'])
            return
        if max_id == -1:
            tweets = tweet_list
        else:
            tweets = tweet_list[1:]
            if len(tweets) == 0:
                print 'Finished Harvest!'
                return
        for tweet in tweets:
            max_id = id_str = tweet['id_str']
            try:
                if tweet_count == numtweets:
                    print 'Finished Harvest- hit numtweets!'
                    return
                if uri != None:
                    db[user].update({'id_str': id_str}, tweet, upsert=True)
                else:
                    print tweet['text']
                tweet_count += 1
                if verbose == True and uri != None:
                    print tweet['text']
            except Exception, err:
                print 'Unexpected error encountered: %s' % (err)
                return
        url = base_url + '&max_id=' + max_id

if __name__ == '__main__':
    try:
        main()
    except SystemExit as e:
        if e.code == 0:
            pass
You initially set stream = 0. When your try...except block catches an HTTP response with a code that isn't 404 or 401, stream is still equal to 0, and your except block doesn't break out of the function.
I'd look more closely at what this response says.
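For illustration, a sketch of how that except block could be tightened so an unhandled HTTP error never falls through to json.load() while stream is still 0 (same Python 2 style as the script above):

try:
    stream = urllib2.urlopen(request)
except urllib2.HTTPError, err:
    if err.code == 404:
        print 'Error: Unknown user. Check --user arg'
    elif err.code == 401:
        print 'Error: Unauthorized. Check Twitter credentials'
    else:
        print 'Error: HTTP %d returned for %s' % (err.code, url)
    # Always leave the function on an HTTP error so json.load(stream)
    # is never handed the integer 0.
    return
tweet_list = json.load(stream)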
