Listing contents of a bucket with boto3 - python

How can I see what's inside a bucket in S3 with boto3? (i.e. do an "ls")?
Doing the following:
import boto3
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('some/path/')
returns:
s3.Bucket(name='some/path/')
How do I see its contents?

One way to see the contents would be:
for my_bucket_object in my_bucket.objects.all():
print(my_bucket_object)

This is similar to an 'ls' but it does not take into account the prefix folder convention and will list the objects in the bucket. It's left up to the reader to filter out prefixes which are part of the Key name.
In Python 2:
from boto.s3.connection import S3Connection
conn = S3Connection() # assumes boto.cfg setup
bucket = conn.get_bucket('bucket_name')
for obj in bucket.get_all_keys():
print(obj.key)
In Python 3:
from boto3 import client
conn = client('s3') # again assumes boto.cfg setup, assume AWS S3
for key in conn.list_objects(Bucket='bucket_name')['Contents']:
print(key['Key'])

I'm assuming you have configured authentication separately.
import boto3
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('bucket_name')
for file in my_bucket.objects.all():
print(file.key)

My s3 keys utility function is essentially an optimized version of #Hephaestus's answer:
import boto3
s3_paginator = boto3.client('s3').get_paginator('list_objects_v2')
def keys(bucket_name, prefix='/', delimiter='/', start_after=''):
prefix = prefix.lstrip(delimiter)
start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after
for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after):
for content in page.get('Contents', ()):
yield content['Key']
In my tests (boto3 1.9.84), it's significantly faster than the equivalent (but simpler) code:
import boto3
def keys(bucket_name, prefix='/', delimiter='/'):
prefix = prefix.lstrip(delimiter)
bucket = boto3.resource('s3').Bucket(bucket_name)
return (_.key for _ in bucket.objects.filter(Prefix=prefix))
As S3 guarantees UTF-8 binary sorted results, a start_after optimization has been added to the first function.

In order to handle large key listings (i.e. when the directory list is greater than 1000 items), I used the following code to accumulate key values (i.e. filenames) with multiple listings (thanks to Amelio above for the first lines). Code is for python3:
from boto3 import client
bucket_name = "my_bucket"
prefix = "my_key/sub_key/lots_o_files"
s3_conn = client('s3') # type: BaseClient ## again assumes boto.cfg setup, assume AWS S3
s3_result = s3_conn.list_objects_v2(Bucket=bucket_name, Prefix=prefix, Delimiter = "/")
if 'Contents' not in s3_result:
#print(s3_result)
return []
file_list = []
for key in s3_result['Contents']:
file_list.append(key['Key'])
print(f"List count = {len(file_list)}")
while s3_result['IsTruncated']:
continuation_key = s3_result['NextContinuationToken']
s3_result = s3_conn.list_objects_v2(Bucket=bucket_name, Prefix=prefix, Delimiter="/", ContinuationToken=continuation_key)
for key in s3_result['Contents']:
file_list.append(key['Key'])
print(f"List count = {len(file_list)}")
return file_list

If you want to pass the ACCESS and SECRET keys (which you should not do, because it is not secure):
from boto3.session import Session
ACCESS_KEY='your_access_key'
SECRET_KEY='your_secret_key'
session = Session(aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
your_bucket = s3.Bucket('your_bucket')
for s3_file in your_bucket.objects.all():
print(s3_file.key)

#To print all filenames in a bucket
import boto3
s3 = boto3.client('s3')
def get_s3_keys(bucket):
"""Get a list of keys in an S3 bucket."""
resp = s3.list_objects_v2(Bucket=bucket)
for obj in resp['Contents']:
files = obj['Key']
return files
filename = get_s3_keys('your_bucket_name')
print(filename)
#To print all filenames in a certain directory in a bucket
import boto3
s3 = boto3.client('s3')
def get_s3_keys(bucket, prefix):
"""Get a list of keys in an S3 bucket."""
resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
for obj in resp['Contents']:
files = obj['Key']
print(files)
return files
filename = get_s3_keys('your_bucket_name', 'folder_name/sub_folder_name/')
print(filename)
Update:
The most easiest way is to use awswrangler
import awswrangler as wr
wr.s3.list_objects('s3://bucket_name')

import boto3
s3 = boto3.resource('s3')
## Bucket to use
my_bucket = s3.Bucket('city-bucket')
## List objects within a given prefix
for obj in my_bucket.objects.filter(Delimiter='/', Prefix='city/'):
print obj.key
Output:
city/pune.csv
city/goa.csv

A more parsimonious way, rather than iterating through via a for loop you could also just print the original object containing all files inside your S3 bucket:
session = Session(aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
s3 = session.resource('s3')
bucket = s3.Bucket('bucket_name')
files_in_s3 = bucket.objects.all()
#you can print this iterable with print(list(files_in_s3))

So you're asking for the equivalent of aws s3 ls in boto3. This would be listing all the top level folders and files. This is the closest I could get; it only lists all the top level folders. Surprising how difficult such a simple operation is.
import boto3
def s3_ls():
s3 = boto3.resource('s3')
bucket = s3.Bucket('example-bucket')
result = bucket.meta.client.list_objects(Bucket=bucket.name,
Delimiter='/')
for o in result.get('CommonPrefixes'):
print(o.get('Prefix'))

ObjectSummary:
There are two identifiers that are attached to the ObjectSummary:
bucket_name
key
boto3 S3: ObjectSummary
More on Object Keys from AWS S3 Documentation:
Object Keys:
When you create an object, you specify the key name, which uniquely identifies the object in the bucket. For example, in the Amazon S3 console (see AWS Management Console), when you highlight a bucket, a list of objects in your bucket appears. These names are the object keys. The name for a key is a sequence of Unicode characters whose UTF-8 encoding is at most 1024 bytes long.
The Amazon S3 data model is a flat structure: you create a bucket, and the bucket stores objects. There is no hierarchy of subbuckets or subfolders; however, you can infer logical hierarchy using key name prefixes and delimiters as the Amazon S3 console does. The Amazon S3 console supports a concept of folders. Suppose that your bucket (admin-created) has four objects with the following object keys:
Development/Projects1.xls
Finance/statement1.pdf
Private/taxdocument.pdf
s3-dg.pdf
Reference:
AWS S3: Object Keys
Here is some example code that demonstrates how to get the bucket name and the object key.
Example:
import boto3
from pprint import pprint
def main():
def enumerate_s3():
s3 = boto3.resource('s3')
for bucket in s3.buckets.all():
print("Name: {}".format(bucket.name))
print("Creation Date: {}".format(bucket.creation_date))
for object in bucket.objects.all():
print("Object: {}".format(object))
print("Object bucket_name: {}".format(object.bucket_name))
print("Object key: {}".format(object.key))
enumerate_s3()
if __name__ == '__main__':
main()

Here is a simple function that returns you the filenames of all files or files with certain types such as 'json', 'jpg'.
def get_file_list_s3(bucket, prefix="", file_extension=None):
"""Return the list of all file paths (prefix + file name) with certain type or all
Parameters
----------
bucket: str
The name of the bucket. For example, if your bucket is "s3://my_bucket" then it should be "my_bucket"
prefix: str
The full path to the the 'folder' of the files (objects). For example, if your files are in
s3://my_bucket/recipes/deserts then it should be "recipes/deserts". Default : ""
file_extension: str
The type of the files. If you want all, just leave it None. If you only want "json" files then it
should be "json". Default: None
Return
------
file_names: list
The list of file names including the prefix
"""
import boto3
s3 = boto3.resource('s3')
my_bucket = s3.Bucket(bucket)
file_objs = my_bucket.objects.filter(Prefix=prefix).all()
file_names = [file_obj.key for file_obj in file_objs if file_extension is not None and file_obj.key.split(".")[-1] == file_extension]
return file_names

One way that I used to do this:
import boto3
s3 = boto3.resource('s3')
bucket=s3.Bucket("bucket_name")
contents = [_.key for _ in bucket.objects.all() if "subfolders/ifany/" in _.key]

Here is the solution
import boto3
s3=boto3.resource('s3')
BUCKET_NAME = 'Your S3 Bucket Name'
allFiles = s3.Bucket(BUCKET_NAME).objects.all()
for file in allFiles:
print(file.key)

I just did it like this, including the authentication method:
s3_client = boto3.client(
's3',
aws_access_key_id='access_key',
aws_secret_access_key='access_key_secret',
config=boto3.session.Config(signature_version='s3v4'),
region_name='region'
)
response = s3_client.list_objects(Bucket='bucket_name', Prefix=key)
if ('Contents' in response):
# Object / key exists!
return True
else:
# Object / key DOES NOT exist!
return False

With little modification to #Hephaeastus 's code in one of the above comments, wrote the below method to list down folders and objects (files) in a given path. Works similar to s3 ls command.
from boto3 import session
def s3_ls(profile=None, bucket_name=None, folder_path=None):
folders=[]
files=[]
result=dict()
bucket_name = bucket_name
prefix= folder_path
session = boto3.Session(profile_name=profile)
s3_conn = session.client('s3')
s3_result = s3_conn.list_objects_v2(Bucket=bucket_name, Delimiter = "/", Prefix=prefix)
if 'Contents' not in s3_result and 'CommonPrefixes' not in s3_result:
return []
if s3_result.get('CommonPrefixes'):
for folder in s3_result['CommonPrefixes']:
folders.append(folder.get('Prefix'))
if s3_result.get('Contents'):
for key in s3_result['Contents']:
files.append(key['Key'])
while s3_result['IsTruncated']:
continuation_key = s3_result['NextContinuationToken']
s3_result = s3_conn.list_objects_v2(Bucket=bucket_name, Delimiter="/", ContinuationToken=continuation_key, Prefix=prefix)
if s3_result.get('CommonPrefixes'):
for folder in s3_result['CommonPrefixes']:
folders.append(folder.get('Prefix'))
if s3_result.get('Contents'):
for key in s3_result['Contents']:
files.append(key['Key'])
if folders:
result['folders']=sorted(folders)
if files:
result['files']=sorted(files)
return result
This lists down all objects / folders in a given path. Folder_path can be left as None by default and method will list the immediate contents of the root of the bucket.

It can also be done as follows:
csv_files = s3.list_objects_v2(s3_bucket_path)
for obj in csv_files['Contents']:
key = obj['Key']

Using cloudpathlib
cloudpathlib provides a convenience wrapper so that you can use the simple pathlib API to interact with AWS S3 (and Azure blob storage, GCS, etc.). You can install with pip install "cloudpathlib[s3]".
Like with pathlib you can use glob or iterdir to list the contents of a directory.
Here's an example with a public AWS S3 bucket that you can copy and past to run.
from cloudpathlib import CloudPath
s3_path = CloudPath("s3://ladi/Images/FEMA_CAP/2020/70349")
# list items with glob
list(
s3_path.glob("*")
)[:3]
#> [ S3Path('s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0001_5a63d42e-27c6-448a-84f1-bfc632125b8e.jpg'),
#> S3Path('s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0002_a89f1b79-786f-4dac-9dcc-609fb1a977b1.jpg'),
#> S3Path('s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0003_02c30af6-911e-4e01-8c24-7644da2b8672.jpg')]
# list items with iterdir
list(
s3_path.iterdir()
)[:3]
#> [ S3Path('s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0001_5a63d42e-27c6-448a-84f1-bfc632125b8e.jpg'),
#> S3Path('s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0002_a89f1b79-786f-4dac-9dcc-609fb1a977b1.jpg'),
#> S3Path('s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0003_02c30af6-911e-4e01-8c24-7644da2b8672.jpg')]
Created at 2021-05-21 20:38:47 PDT by reprexlite v0.4.2

A good option may also be to run aws cli command from lambda functions
import subprocess
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def run_command(command):
command_list = command.split(' ')
try:
logger.info("Running shell command: \"{}\"".format(command))
result = subprocess.run(command_list, stdout=subprocess.PIPE);
logger.info("Command output:\n---\n{}\n---".format(result.stdout.decode('UTF-8')))
except Exception as e:
logger.error("Exception: {}".format(e))
return False
return True
def lambda_handler(event, context):
run_command('/opt/aws s3 ls s3://bucket-name')

I was stuck on this for an entire night because I just wanted to get the number of files under a subfolder but it was also returning one extra file in the content that was the subfolder itself,
After researching about it I found that this is how s3 works but I had
a scenario where I unloaded the data from redshift in the following directory
s3://bucket_name/subfolder/<10 number of files>
and when I used
paginator.paginate(Bucket=price_signal_bucket_name,Prefix=new_files_folder_path+"/")
it would only return the 10 files, but when I created the folder on the s3 bucket itself then it would also return the subfolder
Conclusion
If the whole folder is uploaded to s3 then listing the only returns the files under prefix
But if the fodler was created on the s3 bucket itself then listing it using boto3 client will also return the subfolder and the files

First, create an s3 client object:
s3_client = boto3.client('s3')
Next, create a variable to hold the bucket name and folder. Pay attention to the slash "/" ending the folder name:
bucket_name = 'my-bucket'
folder = 'some-folder/'
Next, call s3_client.list_objects_v2 to get the folder's content object's metadata:
response = s3_client.list_objects_v2(
Bucket=bucket_name,
Prefix=folder
)
Finally, with the object's metadata, you can obtain the S3 object by calling the s3_client.get_object function:
for object_metadata in response['Contents']:
object_key = object_metadata['Key']
response = s3_client.get_object(
Bucket=bucket_name,
Key=object_key
)
object_body = response['Body'].read()
print(object_body)
As you can see, the object content in the string format is available by calling response['Body'].read()

Related

How to write pyarrow table as csv to s3 directly? [duplicate]

In boto 2, you can write to an S3 object using these methods:
Key.set_contents_from_string()
Key.set_contents_from_file()
Key.set_contents_from_filename()
Key.set_contents_from_stream()
Is there a boto 3 equivalent? What is the boto3 method for saving data to an object stored on S3?
In boto 3, the 'Key.set_contents_from_' methods were replaced by
Object.put()
Client.put_object()
For example:
import boto3
some_binary_data = b'Here we have some data'
more_binary_data = b'Here we have some more data'
# Method 1: Object.put()
s3 = boto3.resource('s3')
object = s3.Object('my_bucket_name', 'my/key/including/filename.txt')
object.put(Body=some_binary_data)
# Method 2: Client.put_object()
client = boto3.client('s3')
client.put_object(Body=more_binary_data, Bucket='my_bucket_name', Key='my/key/including/anotherfilename.txt')
Alternatively, the binary data can come from reading a file, as described in the official docs comparing boto 2 and boto 3:
Storing Data
Storing data from a file, stream, or string is easy:
# Boto 2.x
from boto.s3.key import Key
key = Key('hello.txt')
key.set_contents_from_file('/tmp/hello.txt')
# Boto 3
s3.Object('mybucket', 'hello.txt').put(Body=open('/tmp/hello.txt', 'rb'))
boto3 also has a method for uploading a file directly:
s3 = boto3.resource('s3')
s3.Bucket('bucketname').upload_file('/local/file/here.txt','folder/sub/path/to/s3key')
http://boto3.readthedocs.io/en/latest/reference/services/s3.html#S3.Bucket.upload_file
You no longer have to convert the contents to binary before writing to the file in S3. The following example creates a new text file (called newfile.txt) in an S3 bucket with string contents:
import boto3
s3 = boto3.resource(
's3',
region_name='us-east-1',
aws_access_key_id=KEY_ID,
aws_secret_access_key=ACCESS_KEY
)
content="String content to write to a new S3 file"
s3.Object('my-bucket-name', 'newfile.txt').put(Body=content)
Here's a nice trick to read JSON from s3:
import json, boto3
s3 = boto3.resource("s3").Bucket("bucket")
json.load_s3 = lambda f: json.load(s3.Object(key=f).get()["Body"])
json.dump_s3 = lambda obj, f: s3.Object(key=f).put(Body=json.dumps(obj))
Now you can use json.load_s3 and json.dump_s3 with the same API as load and dump
data = {"test":0}
json.dump_s3(data, "key") # saves json to s3://bucket/key
data = json.load_s3("key") # read json from s3://bucket/key
A cleaner and concise version which I use to upload files on the fly to a given S3 bucket and sub-folder-
import boto3
BUCKET_NAME = 'sample_bucket_name'
PREFIX = 'sub-folder/'
s3 = boto3.resource('s3')
# Creating an empty file called "_DONE" and putting it in the S3 bucket
s3.Object(BUCKET_NAME, PREFIX + '_DONE').put(Body="")
Note: You should ALWAYS put your AWS credentials (aws_access_key_id and aws_secret_access_key) in a separate file, for example- ~/.aws/credentials
After some research, I found this. It can be achieved using a simple csv writer. It is to write a dictionary to CSV directly to S3 bucket.
eg: data_dict = [{"Key1": "value1", "Key2": "value2"}, {"Key1": "value4", "Key2": "value3"}]
assuming that the keys in all the dictionary are uniform.
import csv
import boto3
# Sample input dictionary
data_dict = [{"Key1": "value1", "Key2": "value2"}, {"Key1": "value4", "Key2": "value3"}]
data_dict_keys = data_dict[0].keys()
# creating a file buffer
file_buff = StringIO()
# writing csv data to file buffer
writer = csv.DictWriter(file_buff, fieldnames=data_dict_keys)
writer.writeheader()
for data in data_dict:
writer.writerow(data)
# creating s3 client connection
client = boto3.client('s3')
# placing file to S3, file_buff.getvalue() is the CSV body for the file
client.put_object(Body=file_buff.getvalue(), Bucket='my_bucket_name', Key='my/key/including/anotherfilename.txt')
it is worth mentioning smart-open that uses boto3 as a back-end.
smart-open is a drop-in replacement for python's open that can open files from s3, as well as ftp, http and many other protocols.
for example
from smart_open import open
import json
with open("s3://your_bucket/your_key.json", 'r') as f:
data = json.load(f)
The aws credentials are loaded via boto3 credentials, usually a file in the ~/.aws/ dir or an environment variable.
You may use the below code to write, for example an image to S3 in 2019. To be able to connect to S3 you will have to install AWS CLI using command pip install awscli, then enter few credentials using command aws configure:
import urllib3
import uuid
from pathlib import Path
from io import BytesIO
from errors import custom_exceptions as cex
BUCKET_NAME = "xxx.yyy.zzz"
POSTERS_BASE_PATH = "assets/wallcontent"
CLOUDFRONT_BASE_URL = "https://xxx.cloudfront.net/"
class S3(object):
def __init__(self):
self.client = boto3.client('s3')
self.bucket_name = BUCKET_NAME
self.posters_base_path = POSTERS_BASE_PATH
def __download_image(self, url):
manager = urllib3.PoolManager()
try:
res = manager.request('GET', url)
except Exception:
print("Could not download the image from URL: ", url)
raise cex.ImageDownloadFailed
return BytesIO(res.data) # any file-like object that implements read()
def upload_image(self, url):
try:
image_file = self.__download_image(url)
except cex.ImageDownloadFailed:
raise cex.ImageUploadFailed
extension = Path(url).suffix
id = uuid.uuid1().hex + extension
final_path = self.posters_base_path + "/" + id
try:
self.client.upload_fileobj(image_file,
self.bucket_name,
final_path
)
except Exception:
print("Image Upload Error for URL: ", url)
raise cex.ImageUploadFailed
return CLOUDFRONT_BASE_URL + id

How to filter Boto3 s3 objects?

I want to change this line
files = os.listdir('/Users/milenko/mario/Json_gzips')
in my code,to read .gz files from my bucket straight into list.
I tried
>>> import boto3
>>> s3 = boto3.resource('s3')
>>> s3
s3.ServiceResource()
>>> my_bucket = s3.Bucket('cw-dushpica-tests')
>>> for object_summary in my_bucket.objects.filter(Prefix='*.gz'):
... print(object_summary)
There is no output,it does print nothing.
for object_summary in my_bucket.objects.filter(Prefix='/'):
... print(object_summary)
The same,got nothing.
How should my prefix look like?
The prefix parameter of the filter method means that
Prefix (string) -- Limits the response to keys that begin with the specified prefix.
So, you can limit the path to the specific folder and then filter by yourself for the file extension.
import boto3
s3 = boto3.resource('s3')
bucket = s3.Bucket('your_bucket')
keys = []
for obj in bucket.objects.filter(Prefix='path/to/files/'):
if obj.key.endswith('gz'):
keys.append(obj.key)
print(keys)

AWS Rekognition and s3 calling subfolders in Python Lambda

I'm having trouble figuring out how to access a certain folder within a bucket in s3 using Python
Let's say I'm trying to access this folder in the bucket which contains a bunch of images that I want to run rekognition on:
"myBucket/subfolder/images/"
In /images/ folder there are:
one.jpg
two.jpg
three.jpg
four.jpg
I want to run rekognition's detect_labels on this folder. However, I can't seem to access this folder but if I change the bucket_name to just the root folder ("myBucket"/), then I can access just that folder.
bucket_name = "myBucket/subfolder/images/"
rekognition = boto3.client('rekognition')
s3 = boto3.resource('s3')
bucket = s3.Bucket(name=bucket_name)
That is operating as expected. The bucket name should be just the name of the bucket.
You can then run operation on the bucket, eg:
import boto3
s3 = boto3.resource('s3', region_name='ap-southeast-2')
bucket = s3.Bucket('my-bucket')
for object in bucket.objects.all():
if object.key.startswith('images'):
print object.key
Or, using the client instead of the resource:
import boto3
client = boto3.client('s3', region_name='ap-southeast-2')
response = client.list_objects_v2(Bucket='my-bucket', Prefix='images/')
for object in response['Contents']:
print object['Key']
You could index elsewhere what you have in S3, in such a way, you directly access what you need. Have in mind that to loop against files stored in buckets, may offer very low performance and it gets really slow if the number of keys you have is big.
Following your example, another way to do it:
bucket_name = "myBucket"
folder_name = "subfolder/images/"
rekognition = boto3.client('rekognition')
keys= ['one.jpg','two.jpg','three.jpg','four.jpg']
s3 = boto3.resource('s3')
for k in keys:
obj = s3.Object(bucket_name, folder_name+k )
print(obj.key)
Get the list of items(keys) from any db table in your system.
In the case of AWS Rekognition (as asked), an image file stored in a folder in a S3 bucket will have the key in the form of folder_name/subfolder_name/image_name.jpg. So, since the boto3 Rekognition detect_labels() method has this syntax (per https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rekognition.html#Rekognition.Client.detect_labels):
response = client.detect_labels(
Image={
'Bytes': b'bytes',
'S3Object': {
'Bucket': 'string',
'Name': 'string',
'Version': 'string'
}
},
MaxLabels=123,
MinConfidence=...
)
where the value of Name should be a S3 object key name, you can just feed the entire folder-image path to that dictionary as a string. To loop over multiple images, generate a list of image file names, as recommended in Evhz's answer, and loop over that list while calling the detect_labels() method above (or use a generator).

Search specific file in AWS S3 bucket using python

I have AWS S3 access and the bucket has nearly 300 files inside the bucket. I need to download single file from this bucket by pattern matching or search because i do not know the exact filename (Say files ends with .csv format).
Here is my sample code which shows all files inside the bucket
def s3connection(credentialsdict):
"""
:param access_key: Access key for AWS to establish S3 connection
:param secret_key: Secret key for AWS to establish S3 connection
:param file_name: file name of the billing file(csv file)
:param bucket_name: Name of the bucket which consists of billing files
:return: status, billing_bucket, billing_key
"""
os.environ['S3_USE_SIGV4'] = 'True'
conn = S3Connection(credentialsdict["access_key"], credentialsdict["secret_key"], host='s3.amazonaws.com')
billing_bucket = conn.get_bucket(credentialsdict["bucket_name"], validate=False)
try:
billing_bucket.get_location()
except S3ResponseError as e:
if e.status == 400 and e.error_code == 'AuthorizationHeaderMalformed':
conn.auth_region_name = ET.fromstring(e.body).find('./Region').text
billing_bucket = conn.get_bucket(credentialsdict["bucket_name"])
print billing_bucket
if not billing_bucket:
raise Exception("Please Enter valid bucket name. Bucket %s does not exist"
% credentialsdict.get("bucket_name"))
for key in billing_bucket.list():
print key.name
del os.environ['S3_USE_SIGV4']
Can I pass search string to retrieve the exact matched filenames?
You can use JMESPath expressions to search and filter down S3 files. To do that you need to get s3 paginator over list_objects_v2.
import boto3
client = boto3.client('s3')
paginator = client.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket="your_bucket_name")
Now that you have iterator you can use JMESPath search.
Most useful is contains - to do %like% query
objects = page_iterator.search("Contents[?contains(Key, `partial-file-name`)][]")
But in your case (to find all files ending .csv it's better to use ends_with - to do *.csv query
objects = page_iterator.search("Contents[?ends_with(Key, `.csv`)][]")
Then you can get object keys with
for item in objects:
print(item['Key'])
This answer is based on https://blog.jeffbryner.com/2020/04/21/jupyter-pandas-analysis.html and https://stackoverflow.com/a/27274997/4587704
There is no way to do this because there is no native support for regex in S3. You have to get the entire list and apply the search/regex at the client side. The only filtering option available in list_objects is by prefix.
list_objects
Prefix (string) -- Limits the response to keys that begin with the
specified prefix.
One option is to use the Python module re and apply it to the list of objects.
import re
pattern = re.compile(<file_pattern_you_are_looking_for>)
for key in billing_bucket.list():
if pattern.match(key.name):
print key.name
You can also use the simple if condition like,
prefix_objs = buck.objects.filter(Prefix="your_bucket_path")
for obj in prefix_objs:
key = obj.key
if key.endswith(".csv"):
body = obj.get()['Body'].read()
print(obj.key)

How to save S3 object to a file using boto3

I'm trying to do a "hello world" with new boto3 client for AWS.
The use-case I have is fairly simple: get object from S3 and save it to the file.
In boto 2.X I would do it like this:
import boto
key = boto.connect_s3().get_bucket('foo').get_key('foo')
key.get_contents_to_filename('/tmp/foo')
In boto 3 . I can't find a clean way to do the same thing, so I'm manually iterating over the "Streaming" object:
import boto3
key = boto3.resource('s3').Object('fooo', 'docker/my-image.tar.gz').get()
with open('/tmp/my-image.tar.gz', 'w') as f:
chunk = key['Body'].read(1024*8)
while chunk:
f.write(chunk)
chunk = key['Body'].read(1024*8)
or
import boto3
key = boto3.resource('s3').Object('fooo', 'docker/my-image.tar.gz').get()
with open('/tmp/my-image.tar.gz', 'w') as f:
for chunk in iter(lambda: key['Body'].read(4096), b''):
f.write(chunk)
And it works fine. I was wondering is there any "native" boto3 function that will do the same task?
There is a customization that went into Boto3 recently which helps with this (among other things). It is currently exposed on the low-level S3 client, and can be used like this:
s3_client = boto3.client('s3')
open('hello.txt').write('Hello, world!')
# Upload the file to S3
s3_client.upload_file('hello.txt', 'MyBucket', 'hello-remote.txt')
# Download the file from S3
s3_client.download_file('MyBucket', 'hello-remote.txt', 'hello2.txt')
print(open('hello2.txt').read())
These functions will automatically handle reading/writing files as well as doing multipart uploads in parallel for large files.
Note that s3_client.download_file won't create a directory. It can be created as pathlib.Path('/path/to/file.txt').parent.mkdir(parents=True, exist_ok=True).
boto3 now has a nicer interface than the client:
resource = boto3.resource('s3')
my_bucket = resource.Bucket('MyBucket')
my_bucket.download_file(key, local_filename)
This by itself isn't tremendously better than the client in the accepted answer (although the docs say that it does a better job retrying uploads and downloads on failure) but considering that resources are generally more ergonomic (for example, the s3 bucket and object resources are nicer than the client methods) this does allow you to stay at the resource layer without having to drop down.
Resources generally can be created in the same way as clients, and they take all or most of the same arguments and just forward them to their internal clients.
For those of you who would like to simulate the set_contents_from_string like boto2 methods, you can try
import boto3
from cStringIO import StringIO
s3c = boto3.client('s3')
contents = 'My string to save to S3 object'
target_bucket = 'hello-world.by.vor'
target_file = 'data/hello.txt'
fake_handle = StringIO(contents)
# notice if you do fake_handle.read() it reads like a file handle
s3c.put_object(Bucket=target_bucket, Key=target_file, Body=fake_handle.read())
For Python3:
In python3 both StringIO and cStringIO are gone. Use the StringIO import like:
from io import StringIO
To support both version:
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
# Preface: File is json with contents: {'name': 'Android', 'status': 'ERROR'}
import boto3
import io
s3 = boto3.resource('s3')
obj = s3.Object('my-bucket', 'key-to-file.json')
data = io.BytesIO()
obj.download_fileobj(data)
# object is now a bytes string, Converting it to a dict:
new_dict = json.loads(data.getvalue().decode("utf-8"))
print(new_dict['status'])
# Should print "Error"
Note: I'm assuming you have configured authentication separately. Below code is to download the single object from the S3 bucket.
import boto3
#initiate s3 client
s3 = boto3.resource('s3')
#Download object to the file
s3.Bucket('mybucket').download_file('hello.txt', '/tmp/hello.txt')
If you wish to download a version of a file, you need to use get_object.
import boto3
bucket = 'bucketName'
prefix = 'path/to/file/'
filename = 'fileName.ext'
s3c = boto3.client('s3')
s3r = boto3.resource('s3')
if __name__ == '__main__':
for version in s3r.Bucket(bucket).object_versions.filter(Prefix=prefix + filename):
file = version.get()
version_id = file.get('VersionId')
obj = s3c.get_object(
Bucket=bucket,
Key=prefix + filename,
VersionId=version_id,
)
with open(f"{filename}.{version_id}", 'wb') as f:
for chunk in obj['Body'].iter_chunks(chunk_size=4096):
f.write(chunk)
Ref: https://botocore.amazonaws.com/v1/documentation/api/latest/reference/response.html
When you want to read a file with a different configuration than the default one, feel free to use either mpu.aws.s3_download(s3path, destination) directly or the copy-pasted code:
def s3_download(source, destination,
exists_strategy='raise',
profile_name=None):
"""
Copy a file from an S3 source to a local destination.
Parameters
----------
source : str
Path starting with s3://, e.g. 's3://bucket-name/key/foo.bar'
destination : str
exists_strategy : {'raise', 'replace', 'abort'}
What is done when the destination already exists?
profile_name : str, optional
AWS profile
Raises
------
botocore.exceptions.NoCredentialsError
Botocore is not able to find your credentials. Either specify
profile_name or add the environment variables AWS_ACCESS_KEY_ID,
AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN.
See https://boto3.readthedocs.io/en/latest/guide/configuration.html
"""
exists_strategies = ['raise', 'replace', 'abort']
if exists_strategy not in exists_strategies:
raise ValueError('exists_strategy \'{}\' is not in {}'
.format(exists_strategy, exists_strategies))
session = boto3.Session(profile_name=profile_name)
s3 = session.resource('s3')
bucket_name, key = _s3_path_split(source)
if os.path.isfile(destination):
if exists_strategy is 'raise':
raise RuntimeError('File \'{}\' already exists.'
.format(destination))
elif exists_strategy is 'abort':
return
s3.Bucket(bucket_name).download_file(key, destination)
from collections import namedtuple
S3Path = namedtuple("S3Path", ["bucket_name", "key"])
def _s3_path_split(s3_path):
"""
Split an S3 path into bucket and key.
Parameters
----------
s3_path : str
Returns
-------
splitted : (str, str)
(bucket, key)
Examples
--------
>>> _s3_path_split('s3://my-bucket/foo/bar.jpg')
S3Path(bucket_name='my-bucket', key='foo/bar.jpg')
"""
if not s3_path.startswith("s3://"):
raise ValueError(
"s3_path is expected to start with 's3://', " "but was {}"
.format(s3_path)
)
bucket_key = s3_path[len("s3://"):]
bucket_name, key = bucket_key.split("/", 1)
return S3Path(bucket_name, key)

Categories

Resources