I want to use AWS Spot instances to train neural networks. To prevent loss of the model when the spot instance is terminated, I plan to create a snapshot of the EBS volume, make a new volume, and attach it to a reserved instance. How can I mount, or make the EBS volume available, using Python and boto3?
These are the steps I use to make the volume available on Linux, but I want to automate the process so that I don't have to SSH into the instance every time. Here is the code I use to attach the volume:
import boto3
ec2 = boto3.resource('ec2')
spot = ec2.Instance('i-9a8f5082')
res = ec2.Instance('i-86e65a13')
snapshot = ec2.create_snapshot(VolumeId="vol-5315f7db", Description="testing spot instances")
snapshot.wait_until_completed()  # the snapshot must finish before a volume can be built from it
volume = ec2.create_volume(SnapshotId=snapshot.id, AvailabilityZone='us-west-2a')
ec2.meta.client.get_waiter('volume_available').wait(VolumeIds=[volume.id])
# attach the newly created volume (not the original spot volume) to the reserved instance
res.attach_volume(VolumeId=volume.id, Device='/dev/sdy')
snapshot.delete()
You need to run the mount command on the instance. There are two ways to do it: send the command over an SSH connection, as @mootmoot wrote, or send it through the AWS SSM service, as @Mark B wrote. Below is a detailed SSM example; ignore the parts you don't need.
Send a bash command to instances using AWS SSM:
import time
import boto3

# Amazon EC2 Systems Manager requires:
# 1. An IAM role for the EC2 instances that will process commands. The instances must be
#    launched with an instance profile that carries this role (done at instance creation below).
# 2. Separate permissions for the user running the commands: the IAM user whose access and
#    secret keys you use needs SSM permission (e.g. AmazonSSMFullAccess).
# http://docs.aws.amazon.com/systems-manager/latest/userguide/sysman-configuring-access-policies.html
def execute_commands_on_linux_instances(commands, instance_ids):
client = boto3.client('ssm', **conn_args) # Need your credentials here
    all_ssm_enabled_instances, ssm_enabled_instances = [], []
    not_worked_instances = instance_ids.copy()
    outputs = []
    not_executed = []
# Select only the Instances that have an active ssm agent.
if len(client.describe_instance_information()['InstanceInformationList']) > 0:
resp = client.describe_instance_information(MaxResults=20)['InstanceInformationList']
for ins in resp:
all_ssm_enabled_instances.append(ins['InstanceId'])
ssm_enabled_instances = list(set(all_ssm_enabled_instances).intersection(instance_ids))
not_worked_instances = list(set(instance_ids).difference(all_ssm_enabled_instances))
# Now, send the command !
resp = client.send_command(
DocumentName="AWS-RunShellScript",
Parameters={'commands': [commands]},
InstanceIds=ssm_enabled_instances,
)
# get the command id generated by the send_command
com_id = resp['Command']['CommandId']
        # Wait until the command status is no longer Pending or InProgress
        while True:
            list_comm = client.list_commands(CommandId=com_id)
            status = list_comm['Commands'][0]['Status']
            if status in ('Pending', 'InProgress'):
                time.sleep(1)  # poll politely instead of spinning in a tight loop
            else:
                # The command has finished on every instance (Success, Failed, TimedOut, ...)
                break
        # Get each instance's response to the command (stdout and stderr).
        # Even if the command reached an instance, it may not have been executed there
        # (ResponseCode of -1); such instances are skipped and reported separately.
for i in ssm_enabled_instances:
resp2 = client.get_command_invocation(CommandId=com_id, InstanceId=i)
if resp2['ResponseCode'] == -1:
not_executed.append(i)
else:
outputs.append({'ins_id': i, 'stdout': resp2['StandardOutputContent'],
'stderr': resp2['StandardErrorContent']})
# Remove the instance that couldn't execute the command ever, add it to not_worked_instances
ssm_enabled_instances = list(set(ssm_enabled_instances).difference(not_executed))
not_worked_instances.extend(not_executed)
return ssm_enabled_instances, not_worked_instances, outputs
else:
print("There is no any available instance that has a worked SSM service!")
return ssm_enabled_instances, not_worked_instances, outputs
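For the original question, a hypothetical call could hand the mount commands straight to this function once the volume is attached (the device name and mount point below are assumptions; the volume attached as /dev/sdy typically shows up as /dev/xvdy inside the OS):
# Hypothetical usage for the EBS question: mount the attached volume on the reserved instance.
mount_commands = "sudo mkdir -p /mnt/training && sudo mount /dev/xvdy /mnt/training"
worked, failed, outputs = execute_commands_on_linux_instances(mount_commands, ['i-86e65a13'])
for out in outputs:
    print(out['ins_id'], out['stdout'], out['stderr'])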
Create the instances with an IAM instance profile that contains a role with the required policy attached. Instances created this way come up with a running SSM agent:
def create_ec2_instance(node_type):
# define userdata to be run at instance launch
userdata = """#cloud-config
runcmd:
- cd /tmp
- sudo yum install -y https://s3.amazonaws.com/ec2-downloads-windows/SSMAgent/latest/linux_amd64/amazon-ssm-agent.rpm
"""
ec2_r = boto3.resource('ec2', **conn_args)
rolename = "amazonec2ssmrole"
i_pro_name = "ins_pro_for_ssm"
# Create an iam instance profile and add required role to this instance profile.
# Create a role and attach a policy to it if not exist.
# Instances will have this role to build ssm (ec2 systems manager) connection.
iam = boto3.resource('iam', **conn_args)
try:
response= iam.meta.client.get_instance_profile(InstanceProfileName=i_pro_name)
    except iam.meta.client.exceptions.NoSuchEntityException:
iam.create_instance_profile(InstanceProfileName=i_pro_name)
try:
response = iam.meta.client.get_role(RoleName=rolename)
    except iam.meta.client.exceptions.NoSuchEntityException:
iam.create_role(
AssumeRolePolicyDocument='{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":["ec2.amazonaws.com"]},"Action":["sts:AssumeRole"]}]}',
RoleName=rolename)
role = iam.Role(rolename)
role.attach_policy(PolicyArn='arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM')
iam.meta.client.add_role_to_instance_profile(InstanceProfileName=i_pro_name, RoleName=rolename)
iam_ins_profile = {'Name': i_pro_name}
if node_type == "Medium":
instance = ec2_r.create_instances(
ImageId='ami-aa5ebdd2',
MinCount=1,
MaxCount=1,
UserData=userdata,
InstanceType='t2.medium',
KeyName=key_pair_name,
IamInstanceProfile=iam_ins_profile,
BlockDeviceMappings=[{"DeviceName": "/dev/xvda", "Ebs": {"VolumeSize": 20}}])
elif node_type == "Micro":
instance = ec2_r.create_instances(
ImageId='ami-aa5ebdd2',
MinCount=1,
MaxCount=1,
UserData=userdata,
InstanceType='t2.micro',
KeyName=key_pair_name,
IamInstanceProfile=iam_ins_profile,
BlockDeviceMappings=[{"DeviceName": "/dev/xvda", "Ebs": {"VolumeSize": 10}}])
else:
print("Node Type Error")
return -1
    # Wait for the instance to reach the running state (waiter default: 40 attempts, 15 seconds apart)
print('Waiting for instance {0} to switch to running state'.format(instance[0].id))
waiter = ec2_r.meta.client.get_waiter('instance_running')
waiter.wait(InstanceIds=[instance[0].id])
instance[0].reload()
print('Instance is running, public IP: {0}'.format(instance[0].public_ip_address))
return instance[0].id
Don't forget to give SSM permission (e.g. AmazonSSMFullAccess) to the AWS IAM user whose access and secret keys you use.
By the way, conn_args can be defined as follows:
conn_args = {
'aws_access_key_id': Your_Access_Key,
'aws_secret_access_key': Your_Secret_Key,
'region_name': 'us-west-2'
}
You have to perform those steps in the operating system. You can't perform those steps via the AWS API (Boto3). Your best bet is to script those steps and then kick off the script somehow via Boto3, possibly using the AWS SSM service.
What's wrong with sending and executing an SSH script remotely? Assuming you are using Ubuntu, e.g.:
ssh -i your.pem ubuntu@ec2_name_or_ip 'sudo bash -s' < mount_script.sh
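If you want to stay in Python rather than typing that by hand, a minimal sketch is to shell out to the same command with subprocess (the key path, user and host are just the placeholders from the command above):
import subprocess

# Run the local mount script on the instance over SSH; raises CalledProcessError on failure.
with open('mount_script.sh', 'rb') as script:
    subprocess.run(
        ['ssh', '-i', 'your.pem', 'ubuntu@ec2_name_or_ip', 'sudo bash -s'],
        stdin=script,
        check=True,
    )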
If you attach tags to those resources, you can later use boto3 to look the resources up by tag name instead of being tied to specific static IDs.
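For example, a minimal sketch of looking a volume up by a Name tag instead of a hard-coded id (the tag value is an assumption):
import boto3

ec2 = boto3.resource('ec2')
# Find volumes whose Name tag is "training-data" instead of hard-coding vol-5315f7db.
volumes = list(ec2.volumes.filter(Filters=[{'Name': 'tag:Name', 'Values': ['training-data']}]))
for vol in volumes:
    print(vol.id, vol.state)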
I'm trying to write a Python 3.8 script in Cloud Functions to stop all of the instances (VMs), no matter the region, instance name, etc. I'd also like to filter on a specific tag. However, I haven't found an answer anywhere; everywhere it says I need to give the project ID, zone and instance name. Is there any way to get around that?
Use the aggregatedList() and aggregatedList_next() methods to list all instances in all zones. Use the stop() method to stop an instance. To understand the data returned by aggregatedList(), study the REST API response body.
import time
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
credentials = GoogleCredentials.get_application_default()
service = discovery.build('compute', 'v1', credentials=credentials)
# Project ID for this request.
project = "REPLACE_ME"
request = service.instances().aggregatedList(project=project)
while request is not None:
response = request.execute()
    instances = response.get('items', {})
    for zone_uri, scoped_list in instances.items():
        for inst in scoped_list.get('instances', []):
            # Do something here to determine if this instance should be stopped.
            # The aggregatedList keys look like "zones/us-central1-a".
            zone = zone_uri.split('/')[-1]
            # Stop the instance; stop() builds a request that still has to be executed.
            operation = service.instances().stop(
                project=project, zone=zone, instance=inst['name']).execute()
            # Add code to check the operation status, see below.
    request = service.instances().aggregatedList_next(previous_request=request, previous_response=response)
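The question also asks about filtering on a tag. With aggregatedList() each instance dict carries its network tags under tags.items, so a hypothetical check that could be plugged in where the comment above says "determine if this instance should be stopped" (the tag name is an assumption):
def should_stop(inst):
    # Only stop instances carrying the (hypothetical) "auto-stop" network tag.
    return 'auto-stop' in inst.get('tags', {}).get('items', [])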
Example code to check the status of the operation returned by stop(). You might want to stop all instances first, save each operation in a list, and then process the list until all instances have stopped (see the sketch after the snippet below).
while True:
    result = service.zoneOperations().get(
        project=project,
        zone=zone,
        operation=operation['name']).execute()
print('status:', result['status'])
    if result['status'] == 'DONE':
        if 'error' in result:
            raise Exception(result['error'])
        print("done.")
        break
    time.sleep(1)
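A sketch of the batch variant mentioned above: save every stop() operation first, then poll them all until they report DONE (the operations list of (zone, operation) pairs is assumed to have been collected in the aggregatedList() loop):
operations = []  # fill with (zone, operation) pairs while iterating aggregatedList()
while operations:
    still_running = []
    for zone, op in operations:
        result = service.zoneOperations().get(
            project=project, zone=zone, operation=op['name']).execute()
        if result['status'] != 'DONE':
            still_running.append((zone, op))
        elif 'error' in result:
            raise Exception(result['error'])
    operations = still_running
    if operations:
        time.sleep(1)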
I need a Python script to get the ACL of every file in an S3 bucket, to see whether there are public or private files in that bucket. All the files are images, and the Marketing dept wants to know which ones are private.
Something like this:
get_acl(object, bucket, ...)
but recursively, for all 10,000 files in that bucket.
With the AWS CLI I can't get this to work; any idea where I can find some examples?
Thanks
As you state, you need to list all of the objects in the bucket, and either check their ACL, or test to see if you can access the object without authentication.
If you want to check the ACLs, you can run through each object in turn and check:
BUCKET = "example-bucket"
import boto3
s3 = boto3.client('s3')
paginator = s3.get_paginator('list_objects_v2')
# List all of the objects
for page in paginator.paginate(Bucket=BUCKET):
for cur in page.get("Contents", []):
# Get the ACL for each object in turn
# Note: This example does not take into
# account any bucket-level permissions
acl = s3.get_object_acl(Bucket=BUCKET, Key=cur['Key'])
public_read = False
public_write = False
# Check each grant in the ACL
for grant in acl["Grants"]:
            # See if the All Users group has been given a right; keep track of
            # all possibilities in case there are multiple grants for some reason
if grant["Grantee"].get("URI", "") == "http://acs.amazonaws.com/groups/global/AllUsers":
if grant["Permission"] in {"READ", "FULL_CONTROL"}:
public_read = True
if grant["Permission"] in {"WRITE", "FULL_CONTROL"}:
public_read = True
# Write out the status for this object
if public_read and public_write:
status = "public_read_write"
elif public_read:
status = "public_read"
elif public_write:
status = "public_write"
else:
status = "private"
print(f"{cur['Key']},{status}")
When an object in the bucket is public you should get a 200 response code; if it is private, you will get a 403.
So what you could try first is to get the list of all the objects in your bucket:
aws2 s3api list-objects --bucket bucketnamehere
Then, in Python, you could iterate over the objects and make a request to each one, for example:
https://bucketname.s3.us-east-1.amazonaws.com/objectname
You can do the same test from the Unix command line with curl:
curl -I https://bucketname.s3.us-east-1.amazonaws.com/objectname
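The same check from Python, if you would rather not shell out to curl (bucket, region and key below are placeholders, and the requests library is assumed to be installed):
import requests

# HEAD each object anonymously: 200 means publicly readable, 403 means private.
url = "https://bucketname.s3.us-east-1.amazonaws.com/objectname"
resp = requests.head(url)
if resp.status_code == 200:
    print("public:", url)
elif resp.status_code == 403:
    print("private:", url)
else:
    print(resp.status_code, url)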
I am messing with the example code Google provides and I am looking to determine when an instance is ready to do work. They have an operation 'DONE' status and an instance 'RUNNING' status, but there is still a delay until I can actually use the instance. What is the best way to wait for this without waiting a set time period (because that is a waste of time if the instance is ready sooner)?
I modified their wait_for_operation function so it uses isUp:
# [START wait_for_operation]
def wait_for_operation(compute, project, zone, operation):
print('Waiting for operation to finish...')
while True:
result = compute.zoneOperations().get(
project=project,
zone=zone,
operation=operation).execute()
if result['status'] == 'DONE':
print("done.")
print("result:")
print(result)
if 'error' in result:
raise Exception(result['error'])
print("before ex")
instStatus = compute.instances().get(
project=project,
zone=zone,
instance='inst-test1').execute()
print("after ex")
if instStatus['status'] == 'RUNNING':
if isUp("10.xxx.xx.xx"):
print("instStatus = ")
print(instStatus)
return result
else:
print("wasn't replying to ping")
time.sleep(1)
# [END wait_for_operation]
def isUp(hostname):
giveFeedback = False
if platform.system() == "Windows":
response = os.system("ping "+hostname+" -n 1")
else:
response = os.system("ping -c 1 " + hostname)
isUpBool = False
if response == 0:
if giveFeedback:
            print(hostname + ' is up!')
isUpBool = True
else:
if giveFeedback:
            print(hostname + ' is down!')
return isUpBool
See Matthew's answer for original isUp code: Pinging servers in Python
Most of the other code originated here:
https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/compute/api/create_instance.py
GCP status link:
https://cloud.google.com/compute/docs/instances/checking-instance-status
My code works, but is there a better way using instance status or something and avoiding the entire isUp/ping stuff? Seems like my method is a needless workaround.
Obviously I am using Python and this is just messing around code with needless prints etc.
I have a Windows 7 workstation and I don't want to have to require admin rights and a Linux instance.
Edit 1: "by ready to do work", I mean I can send commands to it and it will respond.
Hi, I would suggest you use global operations.
https://cloud.google.com/compute/docs/reference/rest/beta/globalOperations/get
from pprint import pprint
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
credentials = GoogleCredentials.get_application_default()
service = discovery.build('compute', 'beta', credentials=credentials)
# Project ID for this request.
project = 'my-project' # TODO: Update placeholder value.
# Name of the Operations resource to return.
operation = 'my-operation' # TODO: Update placeholder value.
request = service.globalOperations().get(project=project, operation=operation)
response = request.execute()
# TODO: Change code below to process the `response` dict:
pprint(response)
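For the question's use case you would typically poll that request until the operation reports DONE; a minimal sketch (the one-second polling interval is an arbitrary choice):
import time

# Poll the operation until the Compute API reports it as DONE.
while True:
    response = service.globalOperations().get(project=project, operation=operation).execute()
    if response['status'] == 'DONE':
        if 'error' in response:
            raise Exception(response['error'])
        break
    time.sleep(1)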
One approach I have used is having a startup script add a field to the instance metadata. You can then check the instance status using your code above and see whether the new metadata has been added, and avoid pinging the server. An added benefit is that this method still works even if the instance has no external IP.
instStatus = compute.instances().get(
project=project,
zone=zone,
instance='inst-test1').execute()
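A hypothetical check for that marker, assuming the startup script writes a metadata key named startup-done when it finishes:
# Look for a "startup-done" key that the startup script is assumed to have written.
items = instStatus.get('metadata', {}).get('items', [])
if any(item['key'] == 'startup-done' for item in items):
    print("instance has finished its startup work")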
I'm using Docker with AWS ECR repo. One of the steps they instruct you to do is to run "docker tag" to tag a built image with a tag that includes a "fully-ish qualified" location of where the image is going to be stored in ECR.
I was working on migrating a script I had to the Docker Python API (instead of making shell calls to the docker client). I'm unable to find the equivalent of "docker tag" in the API docs at https://docker-py.readthedocs.io/en/stable/images.html.
Can somebody point me in the right direction?
For those of you using ECR/ECS in AWS, here is an example of how you go about this.
Amazon provides instructions like this in ECR to push your images:
aws ecr get-login --no-include-email --region us-west-2
docker build -t myproj .
docker tag myproj:latest XXXXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/myproj:latest
docker push XXXXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/myproj:latest
Here is the rough equivalent using the Docker Python API and Boto (AWS's Python library). It includes tagging the image twice in ECR, so that I can track each image's version number while tracking what the latest is (so my ECS Task can, by default, always grab the most current image)
from base64 import b64decode
import json
import sys

import boto3
import docker
def ecrDemo(version_number):
# The ECR Repository URI
    repo = "XXXXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/myproj"
# The name of the [profile] stored in .aws/credentials
profile = "sandbox"
# The region your ECR repo is in
region = "us-west-2"
# How you want to tag your project locally
local_tag = "myproj"
#Set up a session
session = boto3.Session(profile_name=profile, region_name=region)
ecr = session.client('ecr')
docker_api = docker.APIClient()
print "Building image " + local_tag
for line in docker_api.build(path='.', tag=local_tag, stream=True, \
dockerfile='./Dockerfile.myproj'):
process_docker_api_line(line)
# Make auth call and parse out results
auth = ecr.get_authorization_token()
token = auth["authorizationData"][0]["authorizationToken"]
username, password = b64decode(token).split(':')
endpoint = auth["authorizationData"][0]["proxyEndpoint"]
# print "Make authentication call"
# docker_api.login(username=user, password=password, \
# registry=endpoint, reauth=True)
auth_config_payload = {'username': username, 'password': password}
    version_tag = repo + ':' + version_number
    latest_tag = repo + ':latest'
print "Tagging version " + version_tag
if docker_api.tag(local_tag, version_tag) is False:
raise RuntimeError("Tag appeared to fail: " + version_tag)
print "Tagging latest " + latest_tag
if docker_api.tag(local_tag, latest_tag) is False:
raise RuntimeError("Tag appeared to fail: " + tag_latest)
print "Pushing to repo " + version_tag
for line in docker_api.push(version_tag, stream=True, auth_config=auth_config_payload):
        process_docker_api_line(line)
print "Pushing to repo " + latest_tag
for line in docker_api.push(latest_tag, stream=True, auth_config=auth_config_payload):
        process_docker_api_line(line)
print "Removing taged deployment images"
# You will still have the local_tag image if you need to troubleshoot
docker_api.remove_image(version_tag, force=True)
docker_api.remove_image(latest_tag, force=True)
def process_docker_api_line(payload):
""" Process the output from API stream, throw an Exception if there is an error """
    # Sometimes Docker sends two "{}\n" blocks together...
for segment in payload.split('\n'):
line = segment.strip()
if line:
            line_payload = None
            try:
                line_payload = json.loads(line)
            except ValueError as ex:
                print "Could not decipher payload from API: " + ex.message
            if line_payload:
if "errorDetail" in line_payload:
error = line_payload["errorDetail"]
sys.stderr.write(error["message"])
raise RuntimeError("Error on build - code " + `error["code"]`)
elif "stream" in line_payload:
sys.stdout.write(line_payload["stream"])
These are the steps you can use to build and tag an image.
import docker
tag = 'latest' # or whatever you want
client = docker.from_env()
# identifier of the image on your system
dockername = "%s:%s" % (<name of the image on your system>, <tag>)
# the target identifier
target = "%s:%d/%s" % (<registry address>, <registry_port>, <id or name of the image>)
# the url is usually 'unix://var/run/docker.sock' but depends on your environment
cli = docker.APIClient(base_url="<the daemon\'s url or socket>")
# build the image
cli.build(path=..., tag=dockername, pull=..., buildargs=...)
# tag it
image = client.images.get(dockername)
image.tag(target, tag=tag)
This answer helped me a great deal, thank you!
When I was developing my flow, I was confused by the terminology on the docker-py page, which I think would benefit by more examples.
I was not sure during development if I was correctly building, or if I was having issues with authentication or authorization.
I needed to carefully monitor my build results using the Docker CLI, and capture and analyze the output from the different build, tag and push functions.
Another caveat with getting output from these functions is a warning that is clearly stated for the docker-py pull() function but not the others: if you request that the function provide a generator for output from the operation you must consume that generator. I was able to get my flow working with a debug level of verbosity.
Unfortunately, when I toggled off the verbosity in my code and did not consume the generators for build() and push() (tag() only returns a boolean), my flow only appeared to work: it threw no errors, but it also did not build or push the code! It is better either to leave the streaming output off when you're not in debug mode, or to leave it on and use deque() to consume the output without processing it, as sketched below.
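A minimal sketch of that deque() trick, reusing the names from the ECR example above:
from collections import deque

# Consume the streamed push output without keeping it; the push still runs to completion.
deque(docker_api.push(version_tag, stream=True, auth_config=auth_config_payload), maxlen=0)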
To summarize the differences in how tags are used:
build() takes a 'local tag', which is just the name of the build, e.g. 'myproj'
tag() applies a 'version tag' to a 'local tag' you just built with build(); the version tag will include the registry and a version label, e.g. myregistry.mydomain.com/myname/myproj:latest
push() will push the image in a 'version tag' to the registry designated in the version tag. So in this case, the image you tagged as myregistry.mydomain.com/myname/myproj:latest will be pushed to the registry myregistry.mydomain.com.
I have to start a new machine with ec2.run_instances in a given subnet, but also have a public IP auto-assigned (not a fixed Elastic IP).
When you start a new machine from Amazon's web EC2 console via Request Instance (Instance details), there is a check-box for Auto-assign Public IP.
See it highlighted in the screenshot:
How can I achieve that check-box functionality with boto?
Interestingly enough, it seems that not many people have had this problem. For me it was very important to get this right: without this functionality, instances launched into a nondefault subnet cannot reach out to the internet.
The boto documentation provided no help; there was a related bug recently fixed, see: https://github.com/boto/boto/pull/1705.
It's important to note that the subnet_id and the security groups have to be provided to the network interface (NetworkInterfaceSpecification) instead of to run_instances.
import time
import boto
import boto.ec2.networkinterface
from settings.settings import AWS_ACCESS_GENERIC
ec2 = boto.connect_ec2(*AWS_ACCESS_GENERIC)
interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(subnet_id='subnet-11d02d71',
groups=['sg-0365c56d'],
associate_public_ip_address=True)
interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
reservation = ec2.run_instances(image_id='ami-a1074dc8',
instance_type='t1.micro',
#the following two arguments are provided in the network_interface
#instead at the global level !!
#'security_group_ids': ['sg-0365c56d'],
#'subnet_id': 'subnet-11d02d71',
network_interfaces=interfaces,
key_name='keyPairName')
instance = reservation.instances[0]
instance.update()
while instance.state == "pending":
print instance, instance.state
time.sleep(5)
instance.update()
instance.add_tag("Name", "some name")
print "done", instance
boto3 has NetworkInterfaces that you can configure with DeviceIndex=0, and the SubnetId and SecurityGroupIds (Groups) should be moved from the instance level into this block instead. Here's a version that works for me:
def launch_instance(ami_id, name, type, size, ec2):
rc = ec2.create_instances(
ImageId=ami_id,
MinCount=1,
MaxCount=1,
KeyName=key_name,
InstanceType=size,
NetworkInterfaces=[
{
'DeviceIndex': 0,
'SubnetId': subnet,
'AssociatePublicIpAddress': True,
'Groups': sg
},
]
)
instance_id = rc[0].id
instance_name = name + '-' + type
ec2.create_tags(
Resources = [instance_id],
Tags = [{'Key': 'Name', 'Value': instance_name}]
)
return (instance_id, instance_name)
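Note that launch_instance relies on key_name, subnet and sg being defined in the enclosing scope; a hypothetical call, reusing the example ids from the boto2 answer above, could look like this:
import boto3

ec2 = boto3.resource('ec2')
key_name = 'keyPairName'
subnet = 'subnet-11d02d71'
sg = ['sg-0365c56d']

instance_id, instance_name = launch_instance('ami-a1074dc8', 'web', 'frontend', 't2.micro', ec2)
print(instance_id, instance_name)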
I've never worked with this feature myself, but the run_instances call has a parameter called network_interfaces. According to the documentation you can give IP address details there.