How can we list all the parameters in the AWS Parameter Store using Boto3? There is no ssm.list_parameters in the Boto3 documentation. - python

SSM — Boto 3 Docs 1.9.64 documentation
get_parameters doesn't list all parameters?

For those who want to just copy-paste the code:
import boto3
ssm = boto3.client('ssm')
parameters = ssm.describe_parameters()['Parameters']
Beware of the limit of at most 50 parameters per call!

This code will get all parameters by fetching repeatedly until there are no more (at most 50 are returned per call):
import boto3

def get_resources_from(ssm_details):
    results = ssm_details['Parameters']
    resources = [result for result in results]
    next_token = ssm_details.get('NextToken', None)
    return resources, next_token

def main():
    config = boto3.client('ssm', region_name='us-east-1')
    next_token = ' '
    resources = []
    while next_token is not None:
        ssm_details = config.describe_parameters(MaxResults=50, NextToken=next_token)
        current_batch, next_token = get_resources_from(ssm_details)
        resources += current_batch
    print(resources)
    print('done')

You can use the get_paginator API. Find an example below; in my use case I had to get all the values from the SSM parameter store and compare them with a string.
import boto3
import sys

LBURL = sys.argv[1].strip()
client = boto3.client('ssm')
p = client.get_paginator('describe_parameters')
paginator = p.paginate().build_full_result()
for page in paginator['Parameters']:
    response = client.get_parameter(Name=page['Name'])
    value = response['Parameter']['Value']
    if LBURL in value:
        print("Name is: " + page['Name'] + " and Value is: " + value)

One of the other answers here (by Val Lapidas) inspired me to expand it to this, as his solution doesn't get the SSM parameter value or some other additional details.
The downside here is that the AWS function client.get_parameters() only allows 10 names per call.
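If you need to fetch many names with get_parameters(), a small chunking helper can work around the 10-name limit. This is a hedged sketch; the helper name is mine, not part of the answer's code:
def chunked(names, size=10):
    # get_parameters() accepts at most 10 names per call, so yield batches of 10.
    for i in range(0, len(names), size):
        yield names[i:i + size]

# usage sketch:
# for batch in chunked(param_names):
#     ssm_client.get_parameters(Names=batch, WithDecryption=True)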
There's one referenced function call in this code (to_pdatetime(...)) that I have omitted - it just takes the datetime value and makes sure it is a "naive" datetime. This is because I am ultimately dumping this data to an Excel file using pandas, which doesn't deal well with timezones.
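For reference, a minimal sketch of what such a to_pdatetime helper might look like; this implementation is my assumption, not the author's omitted code:
from datetime import datetime

def to_pdatetime(dt: datetime) -> datetime:
    # Assumed helper: strip tzinfo so the pandas/Excel export doesn't choke on
    # timezone-aware datetimes. Not the author's original implementation.
    return dt.replace(tzinfo=None) if dt is not None else None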
from typing import List, Tuple

from boto3 import session
from mypy_boto3_ssm import SSMClient


def ssm_params(aws_session: session.Session = None) -> List[dict]:
    """
    Return a detailed list of all the SSM parameters.
    """

    # -------------------------------------------------------------
    def get_parameter_values(ssm_client: SSMClient, ssm_details: dict) -> Tuple[list, str]:
        """
        Retrieve additional attributes for the SSM parameters contained in the 'ssm_details'
        dictionary passed in.
        """
        # Get the details
        ssm_param_details = ssm_details['Parameters']

        # Just the names, ma'am
        param_names = [result['Name'] for result in ssm_param_details]

        # Get the params, including the values
        ssm_params_with_values = ssm_client.get_parameters(Names=param_names,
                                                           WithDecryption=True)

        resources = []
        result: dict
        for result in ssm_params_with_values['Parameters']:
            # Get the matching parameter from the `ssm_details` dict since this has some of
            # the fields that aren't in the `ssm_params_with_values` returned from "get_parameters".
            param_details = next((zz for zz in ssm_param_details if zz.get('Name', None) == result['Name']), {})

            param_policy = param_details.get('Policies', None)
            if param_policy is not None and len(param_policy) == 0:
                param_policy = None

            resources.append({
                'Name': result['Name'],
                'LastModifiedDate': to_pdatetime(result['LastModifiedDate']),
                'LastModifiedUser': param_details.get('LastModifiedUser', None),
                'Version': result['Version'],
                'Tier': param_details.get('Tier', None),
                'Policies': param_policy,
                'ARN': result['ARN'],
                'DataType': result.get('DataType', None),
                'Type': result.get('Type', None),
                'Value': result.get('Value', None)
            })

        next_token = ssm_details.get('NextToken', None)
        return resources, next_token
    # -------------------------------------------------------------

    if aws_session is None:
        raise ValueError('No session.')

    # Create SSM client
    aws_ssm_client = aws_session.client('ssm')

    next_token = ' '
    ssm_resources = []
    while next_token is not None:
        # The "describe_parameters" call gets a whole lot of info on the defined SSM params,
        # except their actual values. Due to this limitation let's call the nested function
        # to get the values, and a few other details.
        ssm_descriptions = aws_ssm_client.describe_parameters(MaxResults=10,
                                                              NextToken=next_token)

        # This will get additional details for the params, including values.
        current_batch, next_token = get_parameter_values(ssm_client=aws_ssm_client,
                                                         ssm_details=ssm_descriptions)
        ssm_resources += current_batch

    print(f'SSM Parameters: {len(ssm_resources)}')
    return ssm_resources

There's no ListParameters, only DescribeParameters, which lists all the parameters, or you can set filters.
Boto3 Docs Link:
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm.html#SSM.Client.describe_parameters
AWS API Documentation Link:
https://docs.aws.amazon.com/systems-manager/latest/APIReference/API_DescribeParameters.html
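For example, a minimal sketch of describe_parameters with a name filter (the prefix value is illustrative):
import boto3

ssm = boto3.client('ssm')
# Restrict the listing to parameters whose names begin with a given prefix.
response = ssm.describe_parameters(
    ParameterFilters=[
        {'Key': 'Name', 'Option': 'BeginsWith', 'Values': ['/myapp/']}
    ]
)
for param in response['Parameters']:
    print(param['Name'])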

You can use get_parameters() and get_parameters_by_path().
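A short sketch of get_parameters_by_path, which returns values directly and can be paginated (the path is illustrative):
import boto3

ssm = boto3.client('ssm')
# get_parameters_by_path returns the values as well; the paginator handles NextToken.
paginator = ssm.get_paginator('get_parameters_by_path')
for page in paginator.paginate(Path='/myapp/', Recursive=True, WithDecryption=True):
    for param in page['Parameters']:
        print(param['Name'], param['Value'])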

Use paginators.
paginator = client.get_paginator('describe_parameters')
More information here.
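A short sketch of iterating the pages (no filters, all parameters):
import boto3

client = boto3.client('ssm')
paginator = client.get_paginator('describe_parameters')
# The paginator transparently follows NextToken past the 50-result page limit.
names = []
for page in paginator.paginate():
    names.extend(p['Name'] for p in page['Parameters'])
print(len(names), 'parameters found')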

Related

Getting all log events while passing optional argument

I am using the boto3 API to get all the log events in CloudWatch.
The following is my code:
import boto3

client = boto3.client("logs")

LOG_GROUP_NAME = "/foo/bar/foo-jobs/foo"
instance_id = "i-somefooid"

log_events = []
response = client.get_log_events(logGroupName=LOG_GROUP_NAME, logStreamName=instance_id, startFromHead=True)
log_events.extend(response["events"])
next_token = response["nextForwardToken"]
while True:
    response = client.get_log_events(logGroupName=LOG_GROUP_NAME, logStreamName=instance_id, nextToken=next_token)
    log_events.extend(response["events"])
    if next_token == response["nextForwardToken"]:
        break
    next_token = response["nextForwardToken"]
print(log_events)
Using this I am able to print all the log events for a specified instance id, but I am not happy that I have to call .get_log_events twice. The reason is that when I make the first call I don't have a nextToken; I only have it after the initial call. Is there a way to simplify this so that I only make the get_log_events call once, inside the while True loop?
I would love to hear some suggestions.
import boto3

log_client = boto3.client('logs')

params = {
    'logGroupName': "/foo/bar/foo-jobs/foo",
    'logStreamName': "i-somefooid",
    'startFromHead': True
}

log_events = []
while True:
    response = log_client.get_log_events(**params)
    log_events.extend(response['events'])
    next_token = response['nextForwardToken']
    # get_log_events signals the end of the stream by returning the same token again
    if params.get('nextToken') == next_token:
        break
    params['nextToken'] = next_token
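If strict get_log_events semantics aren't required, an alternative sketch is the filter_log_events paginator, which follows the tokens for you (same placeholder names as above):
import boto3

log_client = boto3.client('logs')
paginator = log_client.get_paginator('filter_log_events')

log_events = []
# Restrict to the one stream; the paginator handles nextToken automatically.
for page in paginator.paginate(logGroupName="/foo/bar/foo-jobs/foo",
                               logStreamNames=["i-somefooid"]):
    log_events.extend(page['events'])
print(len(log_events))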

How to pass a different argument conditionally

I want to be able to change an argument to a function when a condition is met.
Currently I am doing this, which works, but I am repeating the first argument. Is there a way to change just the second argument?
credential = './credentials.json'

if os.path.exists(credential):
    account = authenticate(client_config=secrets, credentials=credential)
else:
    account = authenticate(client_config=secrets, serialize=credential)
An elegant way is to use kwargs:
credential = './credentials.json'
key = "credentials" if os.path.exists(credential) else "serialize"
auth_kwargs = {"client_config": secrets, key: credential}
account = authenticate(**auth_kwargs)
I think your way is fine too, but you can do this:
credential = './credentials.json'
params = {'serialize': credential}
if os.path.exists(credential):
    params['credentials'] = params.pop('serialize')
account = authenticate(client_config=secrets, **params)
You can pass an (unpacked) dictionary to a function:
credential = './credentials.json'
arguments = {'client_config': secrets, 'serialize': credential}  # default
if os.path.exists(credential):
    arguments.pop('serialize')
    arguments['credentials'] = credential
account = authenticate(**arguments)
There is functools.partial for this:
from functools import partial

credential = './credentials.json'
auth = partial(authenticate, client_config=secrets)

if os.path.exists(credential):
    account = auth(credentials=credential)
else:
    account = auth(serialize=credential)

How to verify the Signature of a JWT generated by AWS Cognito in Python 3.6?

Here's my script
import urllib.request
import json
import time
from jose import jwk, jwt
from jose.utils import base64url_decode
import base64

region = '....'
userpool_id = '.....'
app_client_id = '...'
keys_url = 'https://cognito-idp.{}.amazonaws.com/{}/.well-known/jwks.json'.format(region, userpool_id)
response = urllib.request.urlopen(keys_url)
keys = json.loads(response.read())['keys']

token = request.headers['Authorization']
print(token)

# get the kid from the headers prior to verification
headers = jwt.get_unverified_headers(request.headers['Authorization'])
kid = headers['kid']
print(kid)

# search for the kid in the downloaded public keys
key_index = -1
for i in range(len(keys)):
    if kid == keys[i]['kid']:
        key_index = i
        break
if key_index == -1:
    print('Public key not found in jwks.json')
    return False

# construct the public key
public_key = jwk.construct(keys[key_index])

# get the last two sections of the token,
# message and signature (encoded in base64)
message, encoded_signature = str(token).rsplit('.', 1)

# decode the signature
print('>>encoded signature')
print(encoded_signature)
decoded_signature = base64.b64decode(encoded_signature)

if not public_key.verify(message, decoded_signature):
    print('Signature verification failed')
    return False
print('Signature successfully verified')
I always end up with "Signature verification failed" even though the JWT is generated by a valid, legitimate Cognito user pool. I've looked at the documentation and it does not really specify the whole verification process.
I see you're using jose, and I'm using pyjwt, but this solution might help you. Most of the bulk code at the bottom comes from the "api-gateway-authorizer-python" blueprint. Note that this is very frail code that will just break if anything fails; I ended up not using Lambda authentication but rather selecting AWS_IAM authentication for my API Gateway with Identity Pools, so I never finished it.
This example requires that you install pyjwt and cryptography with pip into your working directory and upload everything as a .zip file.
I'd recommend that you watch this video if you want to consider the AWS_IAM authentication option: https://www.youtube.com/watch?v=VZqG7HjT2AQ
They also have a solution with a more elaborate Lambda authorizer implementation on GitHub at https://github.com/awslabs/aws-serverless-auth-reference-app (they show the link at the beginning of the video), but I don't know about their pip dependencies.
from __future__ import print_function

from jwt.algorithms import RSAAlgorithm
import re
import jwt
import json
import sys
import urllib

region = 'your-region'
userpoolId = 'your-user-pool-id'
appClientId = 'your-app-client-id'
keysUrl = 'https://cognito-idp.{}.amazonaws.com/{}/.well-known/jwks.json'.format(region, userpoolId)

def lambda_handler(event, context):
    bearerToken = event['authorizationToken']
    methodArn = event['methodArn']
    print("Client token: " + bearerToken)
    print("Method ARN: " + methodArn)

    response = urllib.urlopen(keysUrl)
    keys = json.loads(response.read())['keys']

    jwtToken = bearerToken.split(' ')[-1]
    header = jwt.get_unverified_header(jwtToken)
    kid = header['kid']

    jwkValue = findJwkValue(keys, kid)
    publicKey = RSAAlgorithm.from_jwk(json.dumps(jwkValue))

    decoded = decodeJwtToken(jwtToken, publicKey)
    print('Decoded token: ' + json.dumps(decoded))

    principalId = decoded['cognito:username']

    methodArn = event['methodArn'].split(':')
    apiGatewayArnTmp = methodArn[5].split('/')
    awsAccountId = methodArn[4]

    policy = AuthPolicy(principalId, awsAccountId)
    policy.restApiId = apiGatewayArnTmp[0]
    policy.region = methodArn[3]
    policy.stage = apiGatewayArnTmp[1]
    #policy.denyAllMethods()
    policy.allowAllMethods()

    # Finally, build the policy
    authResponse = policy.build()

    # new! -- add additional key-value pairs associated with the authenticated principal
    # these are made available by APIGW like so: $context.authorizer.<key>
    # additional context is cached
    context = {
        'key': 'value',  # $context.authorizer.key -> value
        'number': 1,
        'bool': True
    }
    # context['arr'] = ['foo'] <- this is invalid, APIGW will not accept it
    # context['obj'] = {'foo':'bar'} <- also invalid

    authResponse['context'] = context

    return authResponse

def findJwkValue(keys, kid):
    for key in keys:
        if key['kid'] == kid:
            return key

def decodeJwtToken(token, publicKey):
    try:
        decoded = jwt.decode(token, publicKey, algorithms=['RS256'], audience=appClientId)
        return decoded
    except Exception as e:
        print(e)
        raise

class HttpVerb:
    GET = 'GET'
    POST = 'POST'
    PUT = 'PUT'
    PATCH = 'PATCH'
    HEAD = 'HEAD'
    DELETE = 'DELETE'
    OPTIONS = 'OPTIONS'
    ALL = '*'

class AuthPolicy(object):
    # The AWS account id the policy will be generated for. This is used to create the method ARNs.
    awsAccountId = ''
    # The principal used for the policy, this should be a unique identifier for the end user.
    principalId = ''
    # The policy version used for the evaluation. This should always be '2012-10-17'
    version = '2012-10-17'
    # The regular expression used to validate resource paths for the policy
    pathRegex = '^[/.a-zA-Z0-9-\*]+$'

    '''Internal lists of allowed and denied methods.
    These are lists of objects and each object has 2 properties: A resource
    ARN and a nullable conditions statement. The build method processes these
    lists and generates the appropriate statements for the final policy.
    '''
    allowMethods = []
    denyMethods = []

    # The API Gateway API id. By default this is set to '*'
    restApiId = '*'
    # The region where the API is deployed. By default this is set to '*'
    region = '*'
    # The name of the stage used in the policy. By default this is set to '*'
    stage = '*'

    def __init__(self, principal, awsAccountId):
        self.awsAccountId = awsAccountId
        self.principalId = principal
        self.allowMethods = []
        self.denyMethods = []

    def _addMethod(self, effect, verb, resource, conditions):
        '''Adds a method to the internal lists of allowed or denied methods. Each object in
        the internal list contains a resource ARN and a condition statement. The condition
        statement can be null.'''
        if verb != '*' and not hasattr(HttpVerb, verb):
            raise NameError('Invalid HTTP verb ' + verb + '. Allowed verbs in HttpVerb class')
        resourcePattern = re.compile(self.pathRegex)
        if not resourcePattern.match(resource):
            raise NameError('Invalid resource path: ' + resource + '. Path should match ' + self.pathRegex)

        if resource[:1] == '/':
            resource = resource[1:]

        resourceArn = 'arn:aws:execute-api:{}:{}:{}/{}/{}/{}'.format(self.region, self.awsAccountId, self.restApiId, self.stage, verb, resource)

        if effect.lower() == 'allow':
            self.allowMethods.append({
                'resourceArn': resourceArn,
                'conditions': conditions
            })
        elif effect.lower() == 'deny':
            self.denyMethods.append({
                'resourceArn': resourceArn,
                'conditions': conditions
            })

    def _getEmptyStatement(self, effect):
        '''Returns an empty statement object prepopulated with the correct action and the
        desired effect.'''
        statement = {
            'Action': 'execute-api:Invoke',
            'Effect': effect[:1].upper() + effect[1:].lower(),
            'Resource': []
        }
        return statement

    def _getStatementForEffect(self, effect, methods):
        '''This function loops over an array of objects containing a resourceArn and
        conditions statement and generates the array of statements for the policy.'''
        statements = []
        if len(methods) > 0:
            statement = self._getEmptyStatement(effect)
            for curMethod in methods:
                if curMethod['conditions'] is None or len(curMethod['conditions']) == 0:
                    statement['Resource'].append(curMethod['resourceArn'])
                else:
                    conditionalStatement = self._getEmptyStatement(effect)
                    conditionalStatement['Resource'].append(curMethod['resourceArn'])
                    conditionalStatement['Condition'] = curMethod['conditions']
                    statements.append(conditionalStatement)
            if statement['Resource']:
                statements.append(statement)
        return statements

    def allowAllMethods(self):
        '''Adds a '*' allow to the policy to authorize access to all methods of an API'''
        self._addMethod('Allow', HttpVerb.ALL, '*', [])

    def denyAllMethods(self):
        '''Adds a '*' deny to the policy to deny access to all methods of an API'''
        self._addMethod('Deny', HttpVerb.ALL, '*', [])

    def allowMethod(self, verb, resource):
        '''Adds an API Gateway method (Http verb + Resource path) to the list of allowed
        methods for the policy'''
        self._addMethod('Allow', verb, resource, [])

    def denyMethod(self, verb, resource):
        '''Adds an API Gateway method (Http verb + Resource path) to the list of denied
        methods for the policy'''
        self._addMethod('Deny', verb, resource, [])

    def allowMethodWithConditions(self, verb, resource, conditions):
        '''Adds an API Gateway method (Http verb + Resource path) to the list of allowed
        methods and includes a condition for the policy statement. More on AWS policy
        conditions here: http://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_elements.html#Condition'''
        self._addMethod('Allow', verb, resource, conditions)

    def denyMethodWithConditions(self, verb, resource, conditions):
        '''Adds an API Gateway method (Http verb + Resource path) to the list of denied
        methods and includes a condition for the policy statement. More on AWS policy
        conditions here: http://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_elements.html#Condition'''
        self._addMethod('Deny', verb, resource, conditions)

    def build(self):
        '''Generates the policy document based on the internal lists of allowed and denied
        conditions. This will generate a policy with two main statements for the effect:
        one statement for Allow and one statement for Deny.
        Methods that include conditions will have their own statement in the policy.'''
        if ((self.allowMethods is None or len(self.allowMethods) == 0) and
                (self.denyMethods is None or len(self.denyMethods) == 0)):
            raise NameError('No statements defined for the policy')

        policy = {
            'principalId': self.principalId,
            'policyDocument': {
                'Version': self.version,
                'Statement': []
            }
        }

        policy['policyDocument']['Statement'].extend(self._getStatementForEffect('Allow', self.allowMethods))
        policy['policyDocument']['Statement'].extend(self._getStatementForEffect('Deny', self.denyMethods))

        return policy
The following class verifies Cognito tokens. You are required to install jose and pydantic.
The implementation is derived from this repo; it contains more details, additional functionality, tests, etc.
import json
import logging
import os
import time
import urllib.request
from typing import Dict, List

from jose import jwk, jwt
from jose.utils import base64url_decode
from pydantic import BaseModel


class JWK(BaseModel):
    """A JSON Web Key (JWK) model that represents a cryptographic key.

    The JWK specification:
    https://datatracker.ietf.org/doc/html/rfc7517
    """

    alg: str
    e: str
    kid: str
    kty: str
    n: str
    use: str


class CognitoAuthenticator:
    def __init__(self, pool_region: str, pool_id: str, client_id: str) -> None:
        self.pool_region = pool_region
        self.pool_id = pool_id
        self.client_id = client_id
        self.issuer = f"https://cognito-idp.{self.pool_region}.amazonaws.com/{self.pool_id}"
        self.jwks = self.__get_jwks()

    def __get_jwks(self) -> List[JWK]:
        """Returns a list of JSON Web Keys (JWKs) from the issuer. A JWK is a
        public key used to verify a JSON Web Token (JWT).

        Returns:
            List of keys
        Raises:
            Exception when JWKS endpoint does not contain any keys
        """
        file = urllib.request.urlopen(f"{self.issuer}/.well-known/jwks.json")
        res = json.loads(file.read().decode("utf-8"))
        if not res.get("keys"):
            raise Exception("The JWKS endpoint does not contain any keys")
        jwks = [JWK(**key) for key in res["keys"]]
        return jwks

    def verify_token(
        self,
        token: str,
    ) -> bool:
        """Verify a JSON Web Token (JWT).

        For more details refer to:
        https://docs.aws.amazon.com/cognito/latest/developerguide/amazon-cognito-user-pools-using-tokens-verifying-a-jwt.html

        Args:
            token: The token to verify
        Returns:
            True if valid, False otherwise
        """
        try:
            self._is_jwt(token)
            self._get_verified_header(token)
            self._get_verified_claims(token)
        except CognitoError:
            return False
        return True

    def _is_jwt(self, token: str) -> bool:
        """Validate a JSON Web Token (JWT).

        A JSON Web Token (JWT) includes three sections: Header, Payload and
        Signature. They are base64url encoded and are separated by dot (.)
        characters. If a JWT token does not conform to this structure, it is
        considered invalid.

        Args:
            token: The token to validate
        Returns:
            True if valid
        Raises:
            CognitoError when invalid token
        """
        try:
            jwt.get_unverified_header(token)
            jwt.get_unverified_claims(token)
        except jwt.JWTError:
            logging.info("Invalid JWT")
            raise InvalidJWTError
        return True

    def _get_verified_header(self, token: str) -> Dict:
        """Verifies the signature of a JSON Web Token (JWT) and returns its
        decoded header.

        Args:
            token: The token to decode header from
        Returns:
            A dict representation of the token header
        Raises:
            CognitoError when unable to verify signature
        """
        # extract key ID (kid) from token
        headers = jwt.get_unverified_header(token)
        kid = headers["kid"]

        # find JSON Web Key (JWK) that matches kid from token
        key = None
        for k in self.jwks:
            if k.kid == kid:
                # construct a key object from found key data
                key = jwk.construct(k.dict())
                break
        if not key:
            logging.info(f"Unable to find a signing key that matches '{kid}'")
            raise InvalidKidError

        # get message and signature (base64 encoded)
        message, encoded_signature = str(token).rsplit(".", 1)
        signature = base64url_decode(encoded_signature.encode("utf-8"))

        if not key.verify(message.encode("utf8"), signature):
            logging.info("Signature verification failed")
            raise SignatureError

        # signature successfully verified
        return headers

    def _get_verified_claims(self, token: str) -> Dict:
        """Verifies the claims of a JSON Web Token (JWT) and returns its claims.

        Args:
            token: The token to decode claims from
        Returns:
            A dict representation of the token claims
        Raises:
            CognitoError when unable to verify claims
        """
        claims = jwt.get_unverified_claims(token)

        # verify expiration time
        if claims["exp"] < time.time():
            logging.info("Expired token")
            raise TokenExpiredError

        # verify issuer
        if claims["iss"] != self.issuer:
            logging.info("Invalid issuer claim")
            raise InvalidIssuerError

        # verify audience
        # note: claims["client_id"] for access token, claims["aud"] otherwise
        if claims["client_id"] != self.client_id:
            logging.info("Invalid audience claim")
            raise InvalidAudienceError

        # verify token use
        if claims["token_use"] != "access":
            logging.info("Invalid token use claim")
            raise InvalidTokenUseError

        # claims successfully verified
        return claims


class CognitoError(Exception):
    pass


class InvalidJWTError(CognitoError):
    pass


class InvalidKidError(CognitoError):
    pass


class SignatureError(CognitoError):
    pass


class TokenExpiredError(CognitoError):
    pass


class InvalidIssuerError(CognitoError):
    pass


class InvalidAudienceError(CognitoError):
    pass


class InvalidTokenUseError(CognitoError):
    pass


if __name__ == "__main__":
    auth = CognitoAuthenticator(
        pool_region=os.environ["AWS_COGNITO_REGION"],
        pool_id=os.environ["AWS_USER_POOL_ID"],
        client_id=os.environ["AWS_USER_POOL_CLIENT_ID"],
    )

    # note: if you are not using an access token, see the audience note in _get_verified_claims
    access_token = "my_access_token"
    print(f"Token verified: {auth.verify_token(access_token)}")

Search via Python Search API timing out intermittently

We have an application that is basically just a form submission for requesting a team drive to be created. It's hosted on Google App Engine.
This timeout error is coming from a single field in the form that simply does typeahead for an email address. All of the names on the domain are indexed in the datastore, about 300k entities - nothing is being pulled directly from the directory api. After 10 seconds of searching (via the Python Google Search API), it will time out. This is currently intermittent, but errors have been increasing in frequency.
Error: line 280, in get_result raise _ToSearchError(e) Timeout: Failed to complete request in 9975ms
Essentially, speeding up the searches would resolve this. I looked at the code and I don't believe there is any room for improvement there. I am not sure if increasing the instance class will improve this; it is currently an F2. Or perhaps there is another way to improve the index efficiency, but I'm not entirely sure how one would do that. Any thoughts would be appreciated.
Search Code:
class LookupUsersorGrpService(object):
    '''
    lookupUsersOrGrps accepts various params and performs search
    '''
    def lookupUsersOrGrps(self, params):
        search_results_json = {}
        search_results = []
        directory_users_grps = GoogleDirectoryUsers()
        error_msg = 'Technical error'
        query = ''
        try:
            # Default few values if not present
            if ('offset' not in params) or (params['offset'] is None):
                params['offset'] = 0
            else:
                params['offset'] = int(params['offset'])
            if ('limit' not in params) or (params['limit'] is None):
                params['limit'] = 20
            else:
                params['limit'] = int(params['limit'])

            # Search related to field name
            query = self.appendQueryParam(q=query, p=params, qname='search_name', criteria=':', pname='query', isExactMatch=True, splitString=True)
            # Search related to field email
            query = self.appendQueryParam(q=query, p=params, qname='search_email', criteria=':', pname='query', isExactMatch=True, splitString=True)

            # Perform search
            log.info('Search initialized :\"{}\"'.format(query))

            # sort results by name ascending
            expr_list = [search.SortExpression(expression='name', default_value='', direction=search.SortExpression.ASCENDING)]
            # construct the sort options
            sort_opts = search.SortOptions(expressions=expr_list)

            # Prepare the search index
            index = search.Index(name="GoogleDirectoryUsers", namespace="1")
            search_query = search.Query(
                query_string=query.strip(),
                options=search.QueryOptions(
                    limit=params['limit'],
                    offset=params['offset'],
                    sort_options=sort_opts,
                    returned_fields=directory_users_grps.get_search_doc_return_fields()
                ))

            # Execute the search query
            search_result = index.search(search_query)

            # Start collecting the values
            total_cnt = search_result.number_found
            params['limit'] = len(search_result.results)

            # Prepare the response object
            for teamdriveDoc in search_result.results:
                teamdriveRecord = GoogleDirectoryUsers.query(GoogleDirectoryUsers.email == teamdriveDoc.doc_id).get()
                if teamdriveRecord:
                    if teamdriveRecord.suspended == False:
                        search_results.append(teamdriveRecord.to_dict())

            search_results_json.update({"users": search_results})
            search_results_json.update({"limit": params['limit'] if len(search_results) > 0 else '0'})
            search_results_json.update({"total_count": total_cnt if len(search_results) > 0 else '0'})
            search_results_json.update({"status": "success"})
        except Exception as e:
            log.exception("Error in performing search")
            search_results_json.update({"status": "failed"})
            search_results_json.update({"description": error_msg})
        return search_results_json

    ''' Retrieves the given param from dict and adds to query if exists
    '''
    def appendQueryParam(self, q='', p=[], qname=None, criteria='=', pname=None,
                         isExactMatch=False, splitString=False, defaultValue=None):
        if (pname in p) or (defaultValue is not None):
            if len(q) > 0:
                q += ' OR '
            q += qname
            if criteria:
                q += criteria
            if defaultValue is None:
                val = p[pname]
            else:
                val = defaultValue
            if splitString:
                val = val.replace("", "~")[1:-1]
            # Helps to retain passed argument as it is, example email
            if isExactMatch:
                q += "\"" + val + "\""
            else:
                q += val
        return q
An Index instance's search method accepts a deadline parameter, so you could use that to increase the time that you are willing to wait for the search to respond:
search_result = index.search(search_query, deadline=30)
The documentation doesn't specify the acceptable values for deadline, but other App Engine services tend to accept values up to 60 seconds.

Recursively copying Content from one path to another of s3 buckets using boto in python

I am not able to find any solution for recursively copying contents from one path to another in S3 buckets using boto in Python.
Suppose a bucket B1 has a key structure like:
B1/x/*
I want to copy all the objects recursively from keys like B1/x/* to B1/y/*.
There is not "directory" in S3. Those "/" separator is just part of object name, that's why boto doesn't have such features. Either write a script to deal with it or use third party tools.
AWS customerapps show s3browser that provide such arbitrary directory copying functionality. The typical free version only spawn two threads to move file, the paid version allow you to specify more threads and run faster.
Or you just write script and use s3.client.copy_object to copy the file to another name, then delete them afterwards. e.g.
import boto3
s3 = boto3.client("s3")

# list_objects_v2() gives more info
more_objects = True
found_token = True
while more_objects:
    if found_token is True:
        # first listing: no continuation token yet
        response = s3.list_objects_v2(
            Bucket="mybucket",
            Prefix="B1/x/",
            Delimiter="/")
    else:
        response = s3.list_objects_v2(
            Bucket="mybucket",
            ContinuationToken=found_token,
            Prefix="B1/x/",
            Delimiter="/")
    # use copy_object or copy_from
    for source in response["Contents"]:
        raw_name = source["Key"].split("/")[-1]
        new_name = "new_structure/{}".format(raw_name)
        s3.copy_object(
            ....
        )
    # Now check whether there are more objects to list
    if "NextContinuationToken" in response:
        found_token = response["NextContinuationToken"]
        more_objects = True
    else:
        more_objects = False
** IMPORTANT NOTES **: list_objects only returns a maximum of 1000 keys per listing, and MaxKeys will not raise that limit. So you must use list_objects_v2 and check whether NextContinuationToken is returned to know whether there are more objects, repeating until the listing is exhausted.
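A shorter sketch using the built-in list_objects_v2 paginator, which follows the continuation token for you (bucket and prefixes are illustrative):
import boto3

s3 = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")

# Copy every object under B1/x/ to the same name under B1/y/.
for page in paginator.paginate(Bucket="mybucket", Prefix="B1/x/"):
    for obj in page.get("Contents", []):
        new_key = obj["Key"].replace("B1/x/", "B1/y/", 1)
        s3.copy_object(
            CopySource={"Bucket": "mybucket", "Key": obj["Key"]},
            Bucket="mybucket",
            Key=new_key,
        )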
Just trying to build on previous answer:
import os
import boto3

s3 = boto3.client('s3')

def copyFolderFromS3(pathFrom, bucketTo, locationTo):
    response = {}
    response['status'] = 'failed'
    getBucket = pathFrom.split('/')[2]
    location = '/'.join(pathFrom.split('/')[3:])
    if pathFrom.startswith('s3://'):
        copy_source = {'Bucket': getBucket, 'Key': location}
        uploadKey = locationTo
        recursiveCopyFolderToS3(copy_source, bucketTo, uploadKey)

def recursiveCopyFolderToS3(src, uplB, uplK):
    more_objects = True
    found_token = True
    while more_objects:
        if found_token is True:
            # first listing: no continuation token yet
            response = s3.list_objects_v2(
                Bucket=src['Bucket'],
                Prefix=src['Key'],
                Delimiter="/")
        else:
            response = s3.list_objects_v2(
                Bucket=src['Bucket'],
                ContinuationToken=found_token,
                Prefix=src['Key'],
                Delimiter="/")
        for source in response["Contents"]:
            raw_name = source["Key"].split("/")[-1]
            new_name = os.path.join(uplK, raw_name)
            if raw_name.endswith('_$folder$'):
                src["Key"] = source["Key"].replace('_$folder$', '/')
                new_name = new_name.replace('_$folder$', '')
                recursiveCopyFolderToS3(src, uplB, new_name)
            else:
                src['Key'] = source["Key"]
                s3.copy_object(CopySource=src, Bucket=uplB, Key=new_name)
        if "NextContinuationToken" in response:
            found_token = response["NextContinuationToken"]
            more_objects = True
        else:
            more_objects = False
Or you can also use the simple awscli, which is installed by default on EC2/EMR machines.
import subprocess
cmd='aws s3 cp '+path+' '+uploadUrl+' --recursive'
p=subprocess.Popen(cmd, shell=True,stdout=subprocess.PIPE)
p.communicate()
Instead of using boto3, I opt for aws-cli and sh. See the aws s3 cp docs for full list of arguments, which you can include as kwargs in the following (reworked from my own code) which can be used to copy to / from / between S3 buckets and / or local targets:
import sh  # also assumes aws-cli has been installed

def s3_cp(source, target, **kwargs):
    """
    Copy data from source to target. Include flags as kwargs
    such as recursive=True and include=xyz
    """
    args = []
    for flag_name, flag_value in kwargs.items():
        if flag_value is not False:  # i.e. --quiet=False means omit --quiet
            args.append(f"--{flag_name}")
            if flag_value is not True:  # i.e. --quiet=True means --quiet
                args.append(flag_value)
    args += [source, target]
    sh.aws("s3", "cp", *args)
bucket to bucket (as per the OP's question):
s3_cp("s3://B1/x/", "s3://B1/y/", quiet=True, recursive=True)
or bucket to local:
s3_cp("s3://B1/x/", "my-local-dir/", quiet=True, recursive=True)
Personally I found that this method gave improved transfer time (of a few GB over 20k small files) from a couple of hours to a few minutes compared to boto3. Perhaps under the hood it's doing some threading or simply opening a few connections - but that's just speculation.
Warning: it won't work on Windows.
Related: https://stackoverflow.com/a/46680575/1571593
Another boto3 alternative, using the higher level resource API rather than client:
import os

import boto3


def copy_prefix_within_s3_bucket(
    endpoint_url: str,
    bucket_name: str,
    old_prefix: str,
    new_prefix: str,
) -> None:
    bucket = boto3.resource(
        "s3",
        endpoint_url=endpoint_url,
        aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    ).Bucket(bucket_name)
    for obj in bucket.objects.filter(Prefix=old_prefix):
        old_key = obj.key
        new_key = old_key.replace(old_prefix, new_prefix)
        copy_source = {"Bucket": bucket_name, "Key": old_key}
        bucket.copy(copy_source, new_key)


if __name__ == "__main__":
    copy_prefix_within_s3_bucket(
        endpoint_url="my_endpoint_url",
        bucket_name="my_bucket_name",
        old_prefix="my_old_prefix",
        new_prefix="my_new_prefix",
    )
