I'm writing a tool that uses CloudFormation outputs after a cdk deploy and then sets up the development environment with config files based on those outputs.
At the end of each core infrastructure component (auth, db, webapp, storage, etc.), I have a CfnOutput construct like the following:
cdk.CfnOutput(
self, 'UserPoolID',
value=self.user_pool.user_pool_id,
)
Which outputs something like
Stack.AuthUserPoolIDABC1234 = s1lvgk44ul23ahfd91p4rdngnf
My goal is to get that value (s1lvgk44ul23ahfd91p4rdngnf) into a configuration file config.js, along with other values from other CloudFormation outputs.
So I wrote a wrapper around CfnOutput like the following:
import os
def cfn_output(scope, prefix, name, value):
    """Declare a CloudFormation output and mirror its value to a flat file.

    Creates a ``cdk.CfnOutput`` called ``name`` on ``scope`` and then writes
    ``value`` to ``.tmp/<prefix><name>.txt`` so other processes can read it.

    NOTE(review): as reported in the surrounding text, during ``cdk synth``
    the value written is a placeholder such as ``${Token[TOKEN.249]}`` rather
    than the deployed value.
    """
    cdk.CfnOutput(scope, name, value=value)

    # Mirror the value to disk under a per-output file name.
    out_dir = '.tmp'
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{prefix}{name}.txt')
    with open(out_path, 'w') as handle:
        handle.write(value)
And so I used it instead of CfnOutput like so:
cfn_output(
scope=self,
prefix='Auth',
name='UserPoolID',
value=self.user_pool.user_pool_id
)
When I run cdk synth, the file generated (.tmp/AuthUserPoolID.txt) has this content:
${Token[TOKEN.249]}
which is obviously not s1lvgk44ul23ahfd91p4rdngnf as I expected.
Any solutions or workarounds to getting that token resolved into something usable, or perhaps a different solution altogether?
Instead I decided to use the SDK to get the evaluated outputs from the CloudFormation stack.
# Prepare
cloudformation = boto3.client('cloudformation')
stack_name = 'Stack'

# Get stack outputs. A stack that declares no outputs may not carry an
# 'Outputs' key at all, so fall back to an empty list instead of raising.
res = cloudformation.describe_stacks(StackName=stack_name)
outputs = res['Stacks'][0].get('Outputs', [])

# Logical output names we care about, each mapped to its resolved value.
# The deployed output keys carry a generated suffix (e.g. AuthUserPoolIDABC1234),
# which is why they are matched by prefix below.
mp = {
    'ApiURL': '',
    'AuthUserPoolClientID': '',
    'AuthUserPoolID': '',
    'DatabaseName': '',
    'StorageHostingBucketName': '',
    'WebappURL': '',
}

# Parse stack output names
for output in outputs:
    ok = output['OutputKey']
    ov = output['OutputValue']
    for k in mp:
        if ok.startswith(k):
            mp[k] = ov

# Generate config.js data
data = {
    'endpoint': mp['ApiURL'],
    'userPoolId': mp['AuthUserPoolID'],
    'userPoolClientId': mp['AuthUserPoolClientID'],
}
# Compact separators keep the generated file as small as possible.
json_data = json.dumps(data, separators=(',', ':'))
text = f'window.config={json_data}'

# Write config.js next to the web app's other static assets
configjs = os.path.join(os.path.dirname(__file__), '../static/config.js')
with open(configjs, 'w') as f:
    f.write(text)
Related
I have a Lambda python function that I inherited which searches and reports on installed packages on EC2 instances. It pulls this information from SSM Inventory where the results are output to an S3 bucket. All of the installed packages have specific names until now. Now we need to report on Palo Alto Cortex XDR. The issue I'm facing is that this product includes the version number in the name and we have different versions installed. If I use the exact name (i.e. Cortex XDR 7.8.1.11343) I get reporting on that particular version but not others. I want to use a wild card to do this. I import regex (import re) on line 7 and then I change line 71 to xdr=line['Cortex*']) but it gives me the following error. I'm a bit new to Python and coding so any explanation as to what I'm doing wrong would be helpful.
File "/var/task/SoeSoftwareCompliance/RequiredSoftwareEmail.py", line 72, in build_html
xdr=line['Cortex*'])
import configparser
import logging
import csv
import json
from jinja2 import Template
import boto3
import re
# config: settings are read from config.ini (a relative path, so it is
# resolved against the current working directory)
config = configparser.ConfigParser()
config.read("config.ini")
# logging: use the root logger and emit INFO and above
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# TODO:
# refactor common_csv_header so that we use one with variable
# so that we write all content to one template file.
def _first_value_with_prefix(row, prefix):
    """Return the first non-empty value in ``row`` whose column name starts with ``prefix``.

    Dictionary keys are matched literally, so a key such as ``'Cortex*'`` is
    never expanded as a wildcard. This helper scans the column names instead,
    which lets one report column cover product names that embed a version
    number (e.g. ``'Cortex XDR 7.8.1.11343'``). Returns ``''`` when no
    matching column holds a value, which the caller treats as "not installed".
    """
    for column, value in row.items():
        # csv.DictReader may use None as the key for surplus fields; skip it.
        if column and column.startswith(prefix) and value:
            return value
    return ''


def build_html(account=None,
               ses_email_address=None,
               recipient_email=None):
    """
    Build the software-compliance HTML report for one account and email it.

    Reads the per-account CSV reports from /tmp, keeps only the instances
    that are missing at least one required package, renders
    templates/email.html with the collected rows and sends the result
    through SES. The rendered HTML is also printed.

    :param account: mapping with the keys "id" and "alias"
    :param ses_email_address: address used as the SES ``Source``
    :param recipient_email: address the report is sent to
    :raises TypeError: if ``account`` is not provided
    """
    if account is None:
        # Same exception type as the implicit failure on account["id"],
        # but with a message that says what is actually wrong.
        raise TypeError("build_html() requires an 'account' mapping with 'id' and 'alias' keys")

    account_id = account["id"]
    account_alias = account["alias"]
    linux_ec2s = []
    windows_ec2s = []
    ec2s_not_in_ssm = []
    excluded_ec2s = []

    # linux ec2s html
    with open(f"/tmp/{account_id}_linux_ec2s_required_software_report.csv", "r") as fp:
        lines = csv.DictReader(fp)
        for line in lines:
            if line["platform-type"] == "Linux":
                item = dict(id=line['instance-id'],
                            name=line['instance-name'],
                            ip=line['ip-address'],
                            ssm=line['amazon-ssm-agent'],
                            cw=line['amazon-cloudwatch-agent'],
                            ch=line['cloudhealth-agent'])
                # skip compliant linux ec2s, i.e. those where all values are found
                compliance_status = not all(item.values())
                if compliance_status:
                    linux_ec2s.append(item)

    # windows ec2s html
    with open(f"/tmp/{account_id}_windows_ec2s_required_software_report.csv", "r") as fp:
        lines = csv.DictReader(fp)
        for line in lines:
            if line["platform-type"] == "Windows":
                item = dict(id=line['instance-id'],
                            name=line['instance-name'],
                            ip=line['ip-address'],
                            ssm=line['Amazon SSM Agent'],
                            cw=line['Amazon CloudWatch Agent'],
                            ch=line['CloudHealth Agent'],
                            mav=line['McAfee VirusScan Enterprise'],
                            trx=line['Trellix Agent'],
                            # the product name includes its version number,
                            # so match the column by prefix
                            xdr=_first_value_with_prefix(line, 'Cortex'))
                # skip compliant windows ec2s, i.e. those where all values are found
                compliance_status = not all(item.values())
                if compliance_status:
                    windows_ec2s.append(item)

    # ec2s not found in ssm
    with open(f"/tmp/{account_id}_ec2s_not_in_ssm.csv", "r") as fp:
        lines = csv.DictReader(fp)
        for line in lines:
            item = dict(name=line['instance-name'],
                        id=line['instance-id'],
                        ip=line['ip-address'],
                        pg=line['patch-group'])
            ec2s_not_in_ssm.append(item)

    # display or hide excluded ec2s from report
    display_excluded_ec2s_in_report = json.loads(config.get("settings", "display_excluded_ec2s_in_report"))
    # json.loads turns a bare `true` into the boolean True and a quoted
    # `"true"` into the string; accept both spellings of the setting.
    if display_excluded_ec2s_in_report is True or display_excluded_ec2s_in_report == "true":
        with open(f"/tmp/{account_id}_excluded_from_compliance.csv", "r") as fp:
            lines = csv.DictReader(fp)
            for line in lines:
                item = dict(id=line['instance-id'],
                            name=line['instance-name'],
                            pg=line['patch-group'])
                excluded_ec2s.append(item)

    # pass data to html template
    with open('templates/email.html') as file:
        template = Template(file.read())

    # pass parameters to template renderer
    html = template.render(
        linux_ec2s=linux_ec2s,
        windows_ec2s=windows_ec2s,
        ec2s_not_in_ssm=ec2s_not_in_ssm,
        excluded_ec2s=excluded_ec2s,
        account_id=account_id,
        account_alias=account_alias)

    # consolidated html with multiple tables
    tables_html_code = html
    client = boto3.client('ses')
    client.send_email(
        Destination={
            'ToAddresses': [recipient_email],
        },
        Message={
            'Body': {
                'Html':
                    {'Data': tables_html_code}
            },
            'Subject': {
                'Charset': 'UTF-8',
                'Data': f'SOE | Software Compliance | {account_alias}',
            },
        },
        Source=ses_email_address,
    )
    print(tables_html_code)
If I understand your problem correctly, you are getting a KeyError exception because Python does not support wildcards out of the box. A csv.DictReader creates a standard Python dictionary for each row in csv. Python's dictionary is just an associative array without pattern matching.
You can implement this by regex, though. If you have a dictionary line and you don't know the full name of a key you are looking for, you can solve it by re.search function.
# Example row whose key embeds a version number that is not known in advance.
line = {'Cortex XDR 7.8.1.11343': 'Some value you are looking for'}
# Lazily walk the row and take the value of the first key that matches the
# pattern; next() raises StopIteration when no key matches.
matching_values = (value for key, value in line.items() if re.search('Cortex.+', key))
val = next(matching_values)
print(val)  # 'Some value you are looking for'
Be aware that this assumes that a line dictionary contains at least one item that matches the 'Cortex.+' pattern and returns the first match. You would have to refactor this a bit to change this.
1. import os - missing in the code
2. def build_html(account=None, ...) -> When account is left as None (its default), the lookups account["id"] and account["alias"] fail with the error below.
Ex:
Traceback (most recent call last):
File "C:\Users\pro\Documents\project\pywilds.py", line 134, in <module>
build_html(account=None)
File "C:\Users\pro\Documents\project\pywilds.py", line 33, in build_html
account_id = account["id"]
TypeError: 'NoneType' object is not subscriptable
I hope it helps..
I am trying the example from the Google repo:
https://github.com/googleapis/python-documentai/blob/HEAD/samples/snippets/quickstart_sample.py
I have an error:
metadata=[('x-goog-request-params', 'name=projects/my_proj_id/locations/us/processors/my_processor_id'), ('x-goog-api-client', 'gl-python/3.8.10 grpc/1.38.1 gax/1.30.0 gapic/1.0.0')]), last exception: 503 DNS resolution failed for service: https://us-documentai.googleapis.com/v1/
My full code:
from google.cloud import documentai_v1 as documentai
import os
# TODO(developer): Set these variables before running the sample.
project_id= '123456789'
location = 'us' # Format is 'us' or 'eu'
processor_id = '1a23345gh823892' # Create processor in Cloud Console
file_path = 'document.jpg'
# NOTE(review): this overrides the DNS resolver used by gRPC for the whole
# process. The reply further down suspects it is the root cause of the
# "503 DNS resolution failed" error -- try removing it and confirm.
os.environ['GRPC_DNS_RESOLVER'] = 'native'
def quickstart(project_id: str, location: str, processor_id: str, file_path: str):
    """Send one JPEG to a Document AI processor and print its paragraphs.

    :param project_id: Google Cloud project ID or number
    :param location: processor location, e.g. 'us' or 'eu'
    :param processor_id: ID of a processor already created in the Cloud Console
    :param file_path: path of the local JPEG image to process
    """
    # You must set the api_endpoint if you use a location other than 'us';
    # regional endpoints follow the pattern "<location>-documentai.googleapis.com".
    opts = {}
    if location != "us":
        opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first.
    # Do not append ":process" here: that suffix belongs to the REST URL,
    # and process_document() selects the method itself.
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "image/jpeg"}

    # Configure the process request
    request = {"name": name, "raw_document": document}

    result = client.process_document(request=request)
    document = result.document
    document_pages = document.pages

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in document_pages:
        paragraphs = page.paragraphs
        for paragraph in paragraphs:
            print(paragraph)
            paragraph_text = get_text(paragraph.layout, document)
            print(f"Paragraph text: {paragraph_text}")
def get_text(doc_element: "documentai.Document.Page.Layout", document: "documentai.Document"):
    """
    Document AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.

    :param doc_element: layout object exposing ``text_anchor.text_segments``
    :param document: document object exposing the full ``text``
    :return: the concatenated text of every segment, in order
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    # The indices are passed through int() before slicing, as in the
    # original sample.
    return "".join(
        document.text[int(segment.start_index):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )
def main():
    """Run the quickstart sample with the module-level settings."""
    quickstart(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
    )


if __name__ == '__main__':
    main()
FYI, on the Google Cloud website it stated that the endpoint is:
https://us-documentai.googleapis.com/v1/projects/123456789/locations/us/processors/1a23345gh823892:process
I can use the web interface to run DocumentAI so it is working. I just have the problem with Python code.
Any suggestion is appreciated.
I would suspect the GRPC_DNS_RESOLVER environment variable to be the root cause. Did you try with the corresponding line commented out? Why was it added in your code?
I am trying to do some custom manipulation of a torch.utils.data.DataLoader in AzureML but cannot get it to instantiate directly from my azureml.core.Datastore :
ws = Workspace( # ... etc ... )
ds = Datastore.get(ws, datastore_name='my_ds')
am = ds.as_mount()
# HOW DO I GET base_path, data_file from am?
dataloader = DataLoader(
ListDataset(base_path, data_file), #... etc...
)
The value of am.path() is "$AZUREML_DATAREFERENCE_my_ds" but I cannot figure out how to go from that to a pathlib.Path as is expected by the constructor to ListDataset. Things I've tried include Path(am.path()) and Path(os.environ[am.path()]) but they don't seem to work.
It's clear that there's some answer, since :
script_params = {
'--base_path': ds.as_mount(),
'--epochs': 30,
'--batch_size' : 16,
'--use_cuda': 'true'
}
torch = PyTorch(source_directory='./',
script_params=script_params,
compute_target=compute_target,
entry_script='train.py',
pip_packages=packages,
use_gpu=True)
seems to create a legit object.
You can perhaps try using the DataPath class. It exposes attributes such as path_on_datastore which might be the path you're looking for.
To construct this class from your DataReference object i.e. variable am; you can use create_from_data_reference() method.
Example:
# Look up the registered datastore and request it as a mount; `am` is the
# resulting data reference.
ds = Datastore.get(ws, datastore_name='my_ds')
am = ds.as_mount()
# NOTE(review): a follow-up reply reports that this line raised an error and
# that dropping the parentheses after DataPath made it work.
dp = DataPath().create_from_data_reference(am)
base_path = dp.path_on_datastore
The above code generated an error for me, removing the parentheses after the DataPath instantiation like below made the code run.
# Look up the registered datastore and request it as a mount; `am` is the
# resulting data reference.
ds = Datastore.get(ws, datastore_name='my_ds')
am = ds.as_mount()
# create_from_data_reference is called on the DataPath class itself here,
# not on an instance.
dp = DataPath.create_from_data_reference(am)
base_path = dp.path_on_datastore
Thank you for the code snippet, very useful!
SSM — Boto 3 Docs 1.9.64 documentation
get_parameters doesn't list all parameters?
For those who wants to just copy-paste the code:
import boto3
ssm = boto3.client('ssm')
# A single describe_parameters call returns only one page of results (at most
# 50 parameters, see the warning that follows); keep requesting with the
# returned NextToken, or use a paginator, to collect the rest.
parameters = ssm.describe_parameters()['Parameters']
Beware of the limit of max 50 parameters!
This code will get all parameters by repeatedly fetching the next page until there are no more (at most 50 are returned per call):
import boto3
def get_resources_from(ssm_details):
    """Split one describe_parameters response into (parameters, next token).

    Returns a new list holding the entries of ``ssm_details['Parameters']``
    and the value of ``'NextToken'``, or ``None`` when the response carries
    no token.
    """
    page_items = list(ssm_details['Parameters'])
    return page_items, ssm_details.get('NextToken')
def main():
    """Fetch every SSM parameter description in us-east-1 and print the list.

    describe_parameters returns at most 50 entries per call, so requests are
    repeated with the returned NextToken until the service stops sending one.
    """
    config = boto3.client('ssm', region_name='us-east-1')
    resources = []
    # The first request must not carry a NextToken at all; it is added to the
    # request only once the service has handed one back.
    request = {'MaxResults': 50}
    while True:
        ssm_details = config.describe_parameters(**request)
        current_batch, next_token = get_resources_from(ssm_details)
        resources += current_batch
        if next_token is None:
            break
        request['NextToken'] = next_token
    print(resources)
    print('done')
You can use the get_paginator API. See the example below; in my use case I had to get all the values from the SSM Parameter Store and compare them with a string.
import boto3
import sys
# Substring to search for, taken from the first command-line argument.
LBURL = sys.argv[1].strip()
client = boto3.client('ssm')

# Let the paginator follow every page and merge them into one result.
describe_pager = client.get_paginator('describe_parameters')
all_described = describe_pager.paginate().build_full_result()

for described in all_described['Parameters']:
    param_name = described['Name']
    # describe_parameters carries no values, so fetch each one by name.
    value = client.get_parameter(Name=param_name)['Parameter']['Value']
    if LBURL in value:
        print(f"Name is: {param_name} and Value is: {value}")
One of the responses from above/below(?) (by Val Lapidas) inspired me to expand it to this (as his solution doesn't get the SSM parameter value, and some other, additional details).
The downside here is that the AWS function client.get_parameters() only allows 10 names per call.
There's one referenced function call in this code (to_pdatetime(...)) that I have omitted - it just takes the datetime value and makes sure it is a "naive" datetime. This is because I am ultimately dumping this data to an Excel file using pandas, which doesn't deal well with timezones.
from typing import List, Tuple
from boto3 import session
from mypy_boto3_ssm import SSMClient
def ssm_params(aws_session: session.Session = None) -> List[dict]:
    """
    Return a detailed list of all the SSM parameters.

    Pages through ``describe_parameters`` ten entries at a time (ten is the
    most names ``get_parameters`` accepts per call) and merges each page
    with the values returned by ``get_parameters``.

    :param aws_session: boto3 session used to create the SSM client
    :return: one dict per parameter with the keys Name, LastModifiedDate,
        LastModifiedUser, Version, Tier, Policies, ARN, DataType, Type
        and Value
    :raises ValueError: if ``aws_session`` is None
    """

    # -------------------------------------------------------------
    # Nested helper: enrich one page of descriptions with values.
    # -------------------------------------------------------------
    def get_parameter_values(ssm_client: SSMClient, ssm_details: dict) -> Tuple[list, str]:
        """
        Retrieve additional attributes for the SSM parameters contained in the 'ssm_details'
        dictionary passed in.

        Returns the enriched entries and the page's NextToken, which is
        None when there are no further pages.
        """
        # Get the details
        ssm_param_details = ssm_details['Parameters']
        next_token = ssm_details.get('NextToken', None)

        # Just the names, ma'am
        param_names = [result['Name'] for result in ssm_param_details]
        if not param_names:
            # An empty page would make get_parameters reject the empty
            # Names list, so skip the call but keep paging.
            return [], next_token

        # Index the descriptions by name; they hold some of the fields
        # that are not part of the "get_parameters" response.
        details_by_name = {detail['Name']: detail for detail in ssm_param_details}

        # Get the params, including the values
        ssm_params_with_values = ssm_client.get_parameters(Names=param_names,
                                                           WithDecryption=True)
        resources = []
        result: dict
        for result in ssm_params_with_values['Parameters']:
            param_details = details_by_name.get(result['Name'], {})
            # Treat a missing or empty policy list as "no policies"
            # (previously a missing key crashed on len(None)).
            param_policy = param_details.get('Policies') or None
            resources.append({
                'Name': result['Name'],
                'LastModifiedDate': to_pdatetime(result['LastModifiedDate']),
                'LastModifiedUser': param_details.get('LastModifiedUser', None),
                'Version': result['Version'],
                'Tier': param_details.get('Tier', None),
                'Policies': param_policy,
                'ARN': result['ARN'],
                'DataType': result.get('DataType', None),
                'Type': result.get('Type', None),
                'Value': result.get('Value', None)
            })
        return resources, next_token

    # -------------------------------------------------------------
    # Main body
    # -------------------------------------------------------------
    if aws_session is None:
        raise ValueError('No session.')

    # Create SSM client
    aws_ssm_client = aws_session.client('ssm')
    ssm_resources = []
    # The first request carries no NextToken; it is added only after the
    # service has returned one.
    request = {'MaxResults': 10}
    while True:
        # The "describe_parameters" call gets a whole lot of info on the defined SSM params,
        # except their actual values. Due to this limitation let's call the nested function
        # to get the values, and a few other details.
        ssm_descriptions = aws_ssm_client.describe_parameters(**request)
        # This will get additional details for the params, including values.
        current_batch, next_token = get_parameter_values(ssm_client=aws_ssm_client,
                                                         ssm_details=ssm_descriptions)
        ssm_resources += current_batch
        if next_token is None:
            break
        request['NextToken'] = next_token
    print(f'SSM Parameters: {len(ssm_resources)}')
    return ssm_resources
pythonawsboto3amazon-web-services
There's no ListParameters API, only DescribeParameters, which lists all the parameters; you can also set filters.
Boto3 Docs Link:
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm.html#SSM.Client.describe_parameters
AWS API Documentation Link:
https://docs.aws.amazon.com/systems-manager/latest/APIReference/API_DescribeParameters.html
You can use get_parameters() and get_parameters_by_path().
Use paginators.
paginator = client.get_paginator('describe_parameters')
More information here.
I am not able to find any solution for recursively copying contents from one location to another in S3 buckets using boto in Python.
Suppose a bucket B1 has a key structure like:
B1/x/*
I want to copy all the objects recursively from keys like B1/x/* to B1/y/*
There is no "directory" in S3. The "/" separator is just part of the object name, which is why boto doesn't have such a feature. Either write a script to deal with it or use third-party tools.
AWS customerapps show s3browser that provide such arbitrary directory copying functionality. The typical free version only spawn two threads to move file, the paid version allow you to specify more threads and run faster.
Or you just write script and use s3.client.copy_object to copy the file to another name, then delete them afterwards. e.g.
import boto3
s3 = boto3.client("s3")

# list_objects_v2() returns results in pages, so keep listing until the
# response stops carrying a NextContinuationToken.
more_objects = True
found_token = None
while more_objects:
    list_kwargs = {
        "Bucket": "mybucket",
        "Prefix": "B1/x/",
        "Delimiter": "/",
    }
    # Only follow-up requests carry a continuation token.
    if found_token:
        list_kwargs["ContinuationToken"] = found_token
    response = s3.list_objects_v2(**list_kwargs)

    # use copy_object or copy_from
    # "Contents" is absent when the page holds no objects.
    for source in response.get("Contents", []):
        raw_name = source["Key"].split("/")[-1]
        if not raw_name:
            # The key ends in "/" (a folder placeholder); nothing to copy.
            continue
        new_name = "new_structure/{}".format(raw_name)
        s3.copy_object(
            CopySource={"Bucket": "mybucket", "Key": source["Key"]},
            Bucket="mybucket",
            Key=new_name,
        )

    # Now check whether there are more objects to list
    if "NextContinuationToken" in response:
        found_token = response["NextContinuationToken"]
        more_objects = True
    else:
        more_objects = False
** IMPORTANT NOTES **: list_objects returns a maximum of 1000 keys per listing, and MaxKeys will not raise that limit. So you must use list_objects_v2 and check whether NextContinuationToken is returned to find out whether there are more objects; repeat the call until the listing is exhausted.
Just trying to build on previous answer:
s3 = boto3.client('s3')
def copyFolderFromS3(pathFrom, bucketTo, locationTo):
    """Copy everything under an ``s3://bucket/prefix`` URL to another bucket.

    :param pathFrom: source URL of the form ``s3://<bucket>/<key prefix>``
    :param bucketTo: name of the destination bucket
    :param locationTo: destination key prefix
    :return: ``{'status': 'success'}`` once the copy has run, or
        ``{'status': 'failed'}`` when ``pathFrom`` is not an ``s3://`` URL
    """
    response = {'status': 'failed'}

    # Validate before parsing, so a non-S3 path is reported as failed
    # instead of raising IndexError while being split.
    if not pathFrom.startswith('s3://'):
        return response

    # 's3://bucket/a/b' splits into ['s3:', '', 'bucket', 'a', 'b'].
    parts = pathFrom.split('/')
    getBucket = parts[2]
    location = '/'.join(parts[3:])

    copy_source = {'Bucket': getBucket, 'Key': location}
    uploadKey = locationTo
    recursiveCopyFolderToS3(copy_source, bucketTo, uploadKey)

    response['status'] = 'success'
    return response
def recursiveCopyFolderToS3(src, uplB, uplK):
    """Copy the objects listed under ``src`` into bucket ``uplB`` below ``uplK``.

    Objects whose name ends in ``_$folder$`` are treated as sub-folder
    markers: the function recurses into the matching sub-prefix instead of
    copying the marker itself.

    :param src: dict with the source ``'Bucket'`` and the ``'Key'`` prefix
        to list; it is not modified
    :param uplB: destination bucket name
    :param uplK: destination key prefix
    """
    import posixpath  # S3 keys always use '/', whatever the host OS is

    source_bucket = src['Bucket']
    list_kwargs = {
        'Bucket': source_bucket,
        'Prefix': src['Key'],
        'Delimiter': '/',
    }
    while True:
        response = s3.list_objects_v2(**list_kwargs)

        # "Contents" is absent when the page holds no objects.
        for source in response.get('Contents', []):
            raw_name = source['Key'].split('/')[-1]
            new_name = posixpath.join(uplK, raw_name)
            if raw_name.endswith('_$folder$'):
                # Build a fresh source dict for the sub-folder so the prefix
                # used by this level's listing stays untouched.
                sub_src = {
                    'Bucket': source_bucket,
                    'Key': source['Key'].replace('_$folder$', '/'),
                }
                recursiveCopyFolderToS3(sub_src, uplB, new_name.replace('_$folder$', ''))
            else:
                s3.copy_object(
                    CopySource={'Bucket': source_bucket, 'Key': source['Key']},
                    Bucket=uplB,
                    Key=new_name,
                )

        # Only follow-up requests carry a continuation token.
        if 'NextContinuationToken' not in response:
            break
        list_kwargs['ContinuationToken'] = response['NextContinuationToken']
Or you can also use the simple awscli, which is installed by default on EC2/EMR machines.
import subprocess
# Pass the command as an argument list without a shell, so paths containing
# spaces or shell metacharacters reach the AWS CLI verbatim.
cmd = ['aws', 's3', 'cp', path, uploadUrl, '--recursive']
p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
# communicate() drains stdout and waits for the copy to finish;
# p.returncode is non-zero if the CLI reported a failure.
p.communicate()
Instead of using boto3, I opt for aws-cli and sh. See the aws s3 cp docs for full list of arguments, which you can include as kwargs in the following (reworked from my own code) which can be used to copy to / from / between S3 buckets and / or local targets:
import sh # also assumes aws-cli has been installed
def s3_cp(source, target, **kwargs):
    """
    Copy data from source to target. Include flags as kwargs
    such as recursive=True and include=xyz

    Each keyword becomes a ``--flag`` for ``aws s3 cp``:

    * ``flag=True`` adds the bare flag (``quiet=True`` -> ``--quiet``)
    * ``flag=False`` omits the flag
    * any other value adds the flag followed by that value

    Underscores in keyword names are written as hyphens, because Python
    identifiers cannot contain the hyphens the CLI options use
    (``storage_class="STANDARD_IA"`` -> ``--storage-class STANDARD_IA``).
    """
    args = []
    for flag_name, flag_value in kwargs.items():
        if flag_value is not False:  # i.e. --quiet=False means omit --quiet
            args.append("--" + flag_name.replace("_", "-"))
            if flag_value is not True:  # i.e. --quiet=True means --quiet
                args.append(flag_value)
    args += [source, target]
    sh.aws("s3", "cp", *args)
bucket to bucket (as per the OP's question):
s3_cp("s3://B1/x/", "s3://B1/y/", quiet=True, recursive=True)
or bucket to local:
s3_cp("s3://B1/x/", "my-local-dir/", quiet=True, recursive=True)
Personally I found that this method gave improved transfer time (of a few GB over 20k small files) from a couple of hours to a few minutes compared to boto3. Perhaps under the hood it's doing some threading or simply opening few connections - but that's just speculation.
Warning: it won't work on Windows.
Related: https://stackoverflow.com/a/46680575/1571593
Another boto3 alternative, using the higher level resource API rather than client:
import os
import boto3
def copy_prefix_within_s3_bucket(
    endpoint_url: str,
    bucket_name: str,
    old_prefix: str,
    new_prefix: str,
) -> None:
    """Copy every object under ``old_prefix`` to the same key under ``new_prefix``.

    Source objects are left in place. Credentials are read from the
    ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY`` environment variables.

    :param endpoint_url: URL of the S3 endpoint to talk to
    :param bucket_name: bucket holding both the source and the copies
    :param old_prefix: key prefix to copy from
    :param new_prefix: key prefix to copy to
    :raises KeyError: if either credential environment variable is unset
    """
    bucket = boto3.resource(
        "s3",
        endpoint_url=endpoint_url,
        aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    ).Bucket(bucket_name)

    for obj in bucket.objects.filter(Prefix=old_prefix):
        old_key = obj.key
        # Swap only the leading prefix; str.replace would also rewrite any
        # later occurrence of the same text inside the key.
        new_key = new_prefix + old_key[len(old_prefix):]
        copy_source = {"Bucket": bucket_name, "Key": old_key}
        bucket.copy(copy_source, new_key)


if __name__ == "__main__":
    copy_prefix_within_s3_bucket(
        endpoint_url="my_endpoint_url",
        bucket_name="my_bucket_name",
        old_prefix="my_old_prefix",
        new_prefix="my_new_prefix",
    )