I'm trying to get the total size of a bucket. However total_size returns 0. Of course there are a couple of files in the bucket. If I have five files in my bucket the following function prints five zeros. What am I doing wrong?
bucket = boto3.resource('s3', config=Config(signature_version="s3", s3={'addressing_style': 'path'})).Bucket(name)
for object in bucket.objects.all():
total_size += object.size
print(object.size)
I see few issues:
Not sure about your call to boto3.resource(). Is that correct?
total_size not initialized
Try this:
total_size = 0
bucket = boto3.resource('s3').Bucket('mybucket')
for object in bucket.objects.all():
total_size += object.size
print(object.size)
print(total_size)
Or a one liner:
sum([object.size for object in boto3.resource('s3').Bucket('mybucket').objects.all()])
I am using this:
s3client = boto3.client('s3', region_name=region,
aws_access_key_id=access_key,
aws_secret_access_key=secret_key)
response = s3client.list_objects(Bucket=bucket_name)['Contents']
bucket_size = sum(obj['Size'] for obj in response)
Change signature_version="s3" to signature_version="s3v4".
I also like helloV's answer.
Also specify the region for the bucket instead of relying on the default configuration.
You can use this to get the size in GB:
import boto3
s3 = boto3.resource('s3')
bytes = sum([object.size for object in s3.Bucket('myBucket').objects.all()])
print(f'total bucket size: {bytes//1000/1024/1024} GB')
A simpler alternative is to use Amazon S3 Inventory to dump a list of objects on a daily basis, then calculate the totals from that.
I wrote a python function which returns the bucket size using a daily metric stored in cloudwatch:
def get_bucket_size(bucket_name: str, region: str):
cloudwatch = boto3.client("cloudwatch", region_name=region)
result = cloudwatch.get_metric_statistics(
Namespace="AWS/S3",
Dimensions=[{"Name": "BucketName", "Value": bucket_name},
{"Name": "StorageType", "Value": "StandardStorage"}],
MetricName="BucketSizeBytes",
StartTime=datetime.now() - timedelta(2),
EndTime=datetime.now(),
Period=86400,
Statistics=['Average'],
)
return result["Datapoints"][0]["Average"]
Here's my solution, similar to #Rohit G's except it accounts for list_objects being deprecated in preference for list_objects_v2 and that list_objects_v2 returns a max of 1000 keys (this is the same behavior as list_objects, so #Rohit G's solution, if used, should be updated to consider this - source).
I also included logic for specifying a prefix should anyone want to get just the size of a particular prefix in the bucket, but using as written will get the size of the entire bucket:
import boto3
s3 = boto3.client('s3')
bucket= 'myBucket'
prefix = ''
resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
total_size = sum([obj.get('Size') for obj in resp.get('Contents')])
while resp.get('NextContinuationToken'):
resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, ContinuationToken=resp.get('NextContinuationToken'))
total_size += sum([obj.get('Size') for obj in resp.get('Contents')])
print(f"Size (bytes): {total_size}")
Related
I want to copy a sub-subfolder in an S3 bucket into a different bucket using Python (boto3).
However, the process is painfully slow.
If I copy the folder "by hand" straight on S3 from the browser, the process takes 72 seconds (for a folder with around 140 objects, total size roughly 1.0 GB).
However, if I try to copy it with boto3, it takes 9 times longer (653 seconds).
This is the code that I am using, re-adapted from the boto3 documentation and various answers here in SO:
import boto3
s3 = boto3.resource('s3')
# define source bucket
src_bucket_name = 'bucket_1'
prefix = 'folder_1/'
client = boto3.client('s3')
src_bucket = s3.Bucket(src_bucket_name)
# define destination bucket
dest_bucket_name = 'bucket_2'
dest_bucket = s3.Bucket(dest_bucket_name)
folder = "folder_1/subfolder_1"
response_sub = client.list_objects_v2(Bucket=src_bucket_name, Prefix = folder)
# list files to be copied (select only images, but in this folder there are only images anyway)
files_src = [prefix['Key'] for prefix in response_sub['Contents'] if prefix['Key'].split('.')[-1].lower() in ['jpg','jpeg','png','tiff'] ]
# list of file names after copy
dest_prefix = 'folder_1/subfolder_1/'
files_dest = [dest_prefix+i for i in files_src]
for src,dest in zip(files_src,files_dest):
copy_source = {
'Bucket': src_bucket_name,
'Key': src
}
dest_bucket.copy(copy_source, dest)
Note that up to the last for loop, the code takes a couple of seconds only to run.
Any idea of how to speed up this? Am I doing something stupid/should use some other way of copying files/entire folders?
Thanks to #Suyog Shimpi (who pointed to a similar SO post), I was able to significantly speed up the copying process.
Here the code slightly readapted from the other post:
import os
import boto3
import botocore
import boto3.s3.transfer as s3transfer
import tqdm
s3 = boto3.resource('s3')
# define source bucket
src_bucket_name = 'bucket_1'
prefix = 'folder_1/'
client = boto3.client('s3')
src_bucket = s3.Bucket(src_bucket_name)
# define destination bucket
dest_bucket_name = 'bucket_2'
dest_bucket = s3.Bucket(dest_bucket_name)
folder = "folder_1/subfolder_1"
response_sub = client.list_objects_v2(Bucket=src_bucket_name, Prefix = folder)
# list files to be copied (select only images, but in this folder there are only images anyway)
files_src = [prefix['Key'] for prefix in response_sub['Contents'] if prefix['Key'].split('.')[-1].lower() in ['jpg','jpeg','png','tiff'] ]
# list of file names after copy
dest_prefix = 'folder_1/subfolder_1/'
files_dest = [dest_prefix+i for i in files_src]
botocore_config = botocore.config.Config(max_pool_connections=20)
s3client = boto3.client('s3', config=botocore_config)
transfer_config = s3transfer.TransferConfig(
use_threads=True,
max_concurrency=20,
)
# note that timing the process is optional
# total_size of the files can be obtained with boto3, or on the browser
%time
progress = tqdm.tqdm(
desc='upload',
total=total_size, unit='B', unit_scale=1,
position=0,
bar_format='{desc:<10}{percentage:3.0f}%|{bar:10}{r_bar}')
s3t = s3transfer.create_transfer_manager(s3client, transfer_config)
for src,dest in zip(files_src,files_dest):
copy_source = {
'Bucket': src_bucket_name,
'Key': src
}
s3t.copy(copy_source=copy_source,
bucket = dest_bucket_name,
key = dest,
subscribers=[s3transfer.ProgressCallbackInvoker(progress.update),],
)
# close transfer job
s3t.shutdown()
progress.close();
Thanks Fraccalo for your solution, it helped me a lot!
I adjusted it a little so that we can copy more than 1000 files:
import boto3
import botocore
import boto3.s3.transfer as s3transfer
import tqdm
s3 = boto3.resource('s3')
# define source bucket
src_bucket_name = 'bucket_1'
prefix = 'folder_1/'
client = boto3.client('s3')
src_bucket = s3.Bucket(src_bucket_name)
# define destination bucket
dest_bucket_name = 'bucket_2'
dest_bucket = s3.Bucket(dest_bucket_name)
folder = "folder_1/subfolder_1"
files_src = []
bucket_size = 0
# use paginator to read more than 1000 files
paginator = client.get_paginator('list_objects_v2')
operation_parameters = {'Bucket': src_bucket_name,
'Prefix': folder}
page_iterator = paginator.paginate(**operation_parameters)
for page in page_iterator:
if page.get('Contents', None):
files_src.extend([prefix['Key'] for prefix in page['Contents']])
bucket_size += sum(obj['Size'] for obj in page['Contents'])
# list of file names after copy
dest_prefix = 'folder_1/subfolder_1/'
files_dest = [dest_prefix+i for i in files_src]
botocore_config = botocore.config.Config(max_pool_connections=20)
s3client = boto3.client('s3', config=botocore_config)
transfer_config = s3transfer.TransferConfig(
use_threads=True,
max_concurrency=20,
)
progress = tqdm.tqdm(
desc='upload',
total=bucket_size, unit='B', unit_scale=1,
position=0,
bar_format='{desc:<10}{percentage:3.0f}%|{bar:10}{r_bar}')
s3t = s3transfer.create_transfer_manager(s3client, transfer_config)
for src,dest in zip(files_src,files_dest):
copy_source = {
'Bucket': src_bucket_name,
'Key': src
}
s3t.copy(copy_source=copy_source,
bucket = dest_bucket_name,
key = dest,
subscribers=[s3transfer.ProgressCallbackInvoker(progress.update),],
)
# close transfer job
s3t.shutdown()
progress.close();
I have more than 500,000 objects on s3. I am trying to get the size of each object. I am using the following python code for that
import boto3
bucket = 'bucket'
prefix = 'prefix'
contents = boto3.client('s3').list_objects_v2(Bucket=bucket, MaxKeys=1000, Prefix=prefix)["Contents"]
for c in contents:
print(c["Size"])
But it just gave me the size of the top 1000 objects. Based on the documentation we can't get more than 1000. Is there any way I can get more than that?
The inbuilt boto3 Paginator class is the easiest way to overcome the 1000 record limitation of list-objects-v2. This can be implemented as follows
s3 = boto3.client('s3')
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket='bucket', Prefix='prefix')
for page in pages:
for obj in page['Contents']:
print(obj['Size'])
For more details: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Paginator.ListObjectsV2
Use the ContinuationToken returned in the response as a parameter for subsequent calls, until the IsTruncated value returned in the response is false.
This can be factored into a neat generator function:
def get_all_s3_objects(s3, **base_kwargs):
continuation_token = None
while True:
list_kwargs = dict(MaxKeys=1000, **base_kwargs)
if continuation_token:
list_kwargs['ContinuationToken'] = continuation_token
response = s3.list_objects_v2(**list_kwargs)
yield from response.get('Contents', [])
if not response.get('IsTruncated'): # At the end of the list?
break
continuation_token = response.get('NextContinuationToken')
for file in get_all_s3_objects(boto3.client('s3'), Bucket=bucket, Prefix=prefix):
print(file['Size'])
If you don't NEED to use the boto3.client you can use boto3.resource to get a complete list of your files:
s3r = boto3.resource('s3')
bucket = s3r.Bucket('bucket_name')
files_in_bucket = list(bucket.objects.all())
Then to get the size just:
sizes = [f.size for f in files_in_bucket]
Depending on the size of your bucket this might take a minute.
I have a versioned bucket and would like to delete the object (and all of its versions) from the bucket. However, when I try to delete the object from the console, S3 simply adds a delete marker but does not perform a hard delete.
Is it possible to delete all versions of the object (hard delete) with a particular key?:
s3resource = boto3.resource('s3')
bucket = s3resource.Bucket('my_bucket')
obj = bucket.Object('my_object_key')
# I would like to delete all versions for the object like so:
obj.delete_all_versions()
# or delete all versions for all objects like so:
bucket.objects.delete_all_versions()
The other answers delete objects individually. It is more efficient to use the delete_objects boto3 call and batch process your delete. See the code below for a function which collects all objects and deletes in batches of 1000:
bucket = 'bucket-name'
s3_client = boto3.client('s3')
object_response_paginator = s3_client.get_paginator('list_object_versions')
delete_marker_list = []
version_list = []
for object_response_itr in object_response_paginator.paginate(Bucket=bucket):
if 'DeleteMarkers' in object_response_itr:
for delete_marker in object_response_itr['DeleteMarkers']:
delete_marker_list.append({'Key': delete_marker['Key'], 'VersionId': delete_marker['VersionId']})
if 'Versions' in object_response_itr:
for version in object_response_itr['Versions']:
version_list.append({'Key': version['Key'], 'VersionId': version['VersionId']})
for i in range(0, len(delete_marker_list), 1000):
response = s3_client.delete_objects(
Bucket=bucket,
Delete={
'Objects': delete_marker_list[i:i+1000],
'Quiet': True
}
)
print(response)
for i in range(0, len(version_list), 1000):
response = s3_client.delete_objects(
Bucket=bucket,
Delete={
'Objects': version_list[i:i+1000],
'Quiet': True
}
)
print(response)
The documentation is helpful here:
When versioning is enabled in an S3 bucket, a simple DeleteObject request cannot permanently delete an object from that bucket. Instead, Amazon S3 inserts a delete marker (which is effectively a new version of the object with its own version ID).
When you try to GET an object whose current version is a delete marker, S3 behaves as if the object has been deleted (even though it has not) and returns a 404 error.
To permanently delete an object from a versioned bucket, use DeleteObject, with the relevant version ID, for each and every version of the object (and that includes the delete markers).
I had trouble using the other solutions to this question so here's mine.
import boto3
bucket = "bucket name goes here"
filename = "filename goes here"
client = boto3.client('s3')
paginator = client.get_paginator('list_object_versions')
response_iterator = paginator.paginate(Bucket=bucket)
for response in response_iterator:
versions = response.get('Versions', [])
versions.extend(response.get('DeleteMarkers', []))
for version_id in [x['VersionId'] for x in versions
if x['Key'] == filename and x['VersionId'] != 'null']:
print('Deleting {} version {}'.format(filename, version_id))
client.delete_object(Bucket=bucket, Key=filename, VersionId=version_id)
This code deals with the cases where
object versioning isn't actually turned on
there are DeleteMarkers
there are no DeleteMarkers
there are more versions of a given file than fit in a single API response
Mahesh Mogal's answer doesn't delete DeleteMarkers. Mangohero1's answer fails if the object is missing a DeleteMarker. Hari's answer repeats 10 times (to workaround missing pagination logic).
You can use object_versions.
def delete_all_versions(bucket_name: str, prefix: str):
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
if prefix is None:
bucket.object_versions.delete()
else:
bucket.object_versions.filter(Prefix=prefix).delete()
delete_all_versions("my_bucket", None) # empties the entire bucket
delete_all_versions("my_bucket", "my_prefix/") # deletes all objects matching the prefix (can be only one if only one matches)
As a supplement to #jarmod's answer, here is a way I developed a workaround to "hard deleting" an object (with delete markered objects included);
def get_all_versions(bucket, filename):
s3 = boto3.client('s3')
keys = ["Versions", "DeleteMarkers"]
results = []
for k in keys:
response = s3.list_object_versions(Bucket=bucket)[k]
to_delete = [r["VersionId"] for r in response if r["Key"] == filename]
results.extend(to_delete)
return results
bucket = "YOUR BUCKET NAME"
file = "YOUR FILE"
for version in get_all_versions(bucket, file):
s3.delete_object(Bucket=bucket, Key=file, VersionId=version)
Fewer line solution.
import boto3
def delete_versions(bucket, objects=None): # `objects` is either list of str or None
bucket = boto3.resource('s3').Bucket(bucket)
if objects: # delete specified objects
[version.delete() for version in bucket.object_versions.all() if version.object_key in objects]
else: # or delete all objects in `bucket`
[version.delete() for version in bucket.object_versions.all()]
To delete all versions of an object or objects under a prefix:
Pass the object key /folder/filename or prefix /folder/subfolder/ to the Prefix
import boto3
s3 = boto3.resource('s3')
bucket = s3.Bucket("my-bucket-name")
bucket.object_versions.filter(Prefix="folder/subfolder/").delete()
This post was super helpful without this we would have spent tremendous amount of time cleaning up our S3 folders.
We had a requirement to clean up specific folders only. So I tried the following code and it worked like a charm. Also note that I am iterating through the 10 times to delete more than 1000 objects limit that function has. Feel free to modify the limit as you wish.
import boto3
session = boto3.Session(aws_access_key_id='<YOUR ACCESS KEY>',aws_secret_access_key='<YOUR SECRET KEY>')
bucket_name = '<BUCKET NAME>'
object_name = '<KEY NAME>'
s3 = session.client('s3')
for i in range(10):
versions = s3.list_object_versions (Bucket = bucket_name, Prefix = object_name)
#print (versions)
version_list = versions.get('Versions')
for version in version_list:
keyName = version.get('Key')
versionId = version.get('VersionId')
print (keyName + ':' + versionId)
s3.delete_object(Bucket = bucket_name, Key= keyName, VersionId = versionId)
marker_list = versions.get('DeleteMarkers')
#print(marker_list)
for marker in marker_list:
keyName1 = marker.get('Key')
versionId1 = marker.get('VersionId')
print (keyName1 + ':' + versionId1)
s3.delete_object(Bucket = bucket_name, Key= keyName1, VersionId = versionId1)
this script will delete all version of all object with prefix -
s3 = boto3.resource("s3")
client = boto3.client("s3")
s3_bucket = s3.Bucket(bucket_name)
for obj in s3_bucket.objects.filter(Prefix=""):
response = client.list_object_versions(Bucket=bucket_name, Prefix=obj.key)
while "Versions" in response:
to_delete = [
{"Key": ver["Key"], "VersionId": ver["VersionId"]}
for ver in response["Versions"]
]
delete = {"Objects": to_delete}
client.delete_objects(Bucket=bucket_name, Delete=delete)
response = client.list_object_versions(Bucket=bucket_name, Prefix=obj.key)
client.delete_object(Bucket=bucket_name, Key=obj.key)
Easiest way:
import boto3
bucket = boto3.resource("s3").Bucket("mybucket")
bucket.object_versions.all().delete()
You can delete an object with all of its versions using following code
session = boto3.Session(aws_access_key_id, aws_secret_access_key)
bucket_name = 'bucket_name'
object_name = 'object_name'
s3 = session.client('s3')
versions = s3.list_object_versions (Bucket = bucket_name, Prefix = object_name)
version_list = versions.get('Versions')
for version in version_list:
versionId = version.get('VersionId')
s3.delete_object(Bucket = bucket_name, Key= object_name, VersionId = versionId)
The rest of the answers all miss something. Either using the Prefix parameter, or deleting delete markers, or handling errors...
s3 = boto3.client('s3')
response = s3.list_object_versions(Bucket=bucket_name, Prefix=key)
objects_to_delete = []
# Note that we do not use pagination because we assume the file has less than max versions (something like 300)
# Note that we also traverse delete markers.
for obj in itertools.chain(response.get("Versions", []), response.get("DeleteMarkers", [])):
# NOTE: This is super stupid, but AWS has no API for list_object_versions for a single object, only with prefix.
# So other objects who share the same prefix (e.g "blaze/a.txt" and "bla.json" will also be listed when asking for "bla").
# So we need to be careful here
if obj["Key"] != key:
break
objects_to_delete.append({"Key": obj["Key"], 'VersionId': obj['VersionId']})
if len(objects_to_delete) == 0:
raise FileNotFoundError(f'File {key} not found at bucket {bucket_name}')
deletion_response = s3.delete_objects(Bucket=bucket_name, Delete={"Objects": objects_to_delete, "Quiet": False})
errors = deletion_response.get("Errors", [])
if len(errors) > 0:
raise Exception(f'Failed deleting file {key} from bucket {bucket_name}. Result: {deletion_response}')
boto3 documentation does not clearly specify how to update the user metadata of an already existing S3 Object.
It can be done using the copy_from() method -
import boto3
s3 = boto3.resource('s3')
s3_object = s3.Object('bucket-name', 'key')
s3_object.metadata.update({'id':'value'})
s3_object.copy_from(CopySource={'Bucket':'bucket-name', 'Key':'key'}, Metadata=s3_object.metadata, MetadataDirective='REPLACE')
You can do this using copy_from() on the resource (like this answer) mentions, but you can also use the client's copy_object() and specify the same source and destination. The methods are equivalent and invoke the same code underneath.
import boto3
s3 = boto3.client("s3")
src_key = "my-key"
src_bucket = "my-bucket"
s3.copy_object(Key=src_key, Bucket=src_bucket,
CopySource={"Bucket": src_bucket, "Key": src_key},
Metadata={"my_new_key": "my_new_val"},
MetadataDirective="REPLACE")
The 'REPLACE' value specifies that the metadata passed in the request should overwrite the source metadata entirely. If you mean to only add new key-values, or delete only some keys, you'd have to first read the original data, edit it and call the update.
To replacing only a subset of the metadata correctly:
Retrieve the original metadata with head_object(Key=src_key, Bucket=src_bucket). Also take note of the Etag in the response
Make desired changes to the metadata locally.
Call copy_object as above to upload the new metadata, but pass CopySourceIfMatch=original_etag in the request to ensure the remote object has the metadata you expect before overwriting it. original_etag is the one you got in step 1. In case the metadata (or the data itself) has changed since head_object was called (e.g. by another program running simultaneously), copy_object will fail with an HTTP 412 error.
Reference: boto3 issue 389
Similar to this answer but with the existing Metadata preserved while modifying only what is needed. From the system defined meta data, I've only preserved ContentType and ContentDisposition in this example. Other system defined meta data can also be preserved similarly.
import boto3
s3 = boto3.client('s3')
response = s3.head_object(Bucket=bucket_name, Key=object_name)
response['Metadata']['new_meta_key'] = "new_value"
response['Metadata']['existing_meta_key'] = "new_value"
result = s3.copy_object(Bucket=bucket_name, Key=object_name,
CopySource={'Bucket': bucket_name,
'Key': object_name},
Metadata=response['Metadata'],
MetadataDirective='REPLACE', TaggingDirective='COPY',
ContentDisposition=response['ContentDisposition'],
ContentType=response['ContentType'])
You can either update metadata by adding something or updating a current metadata value with a new one, here is the piece of code I am using :
import sys
import os
import boto3
import pprint
from boto3 import client
from botocore.utils import fix_s3_host
param_1= YOUR_ACCESS_KEY
param_2= YOUR_SECRETE_KEY
param_3= YOUR_END_POINT
param_4= YOUR_BUCKET
#Create the S3 client
s3ressource = client(
service_name='s3',
endpoint_url= param_3,
aws_access_key_id= param_1,
aws_secret_access_key=param_2,
use_ssl=True,
)
# Building a list of of object per bucket
def BuildObjectListPerBucket (variablebucket):
global listofObjectstobeanalyzed
listofObjectstobeanalyzed = []
extensions = ['.jpg','.png']
for key in s3ressource.list_objects(Bucket=variablebucket)["Contents"]:
#print (key ['Key'])
onemoreObject=key['Key']
if onemoreObject.endswith(tuple(extensions)):
listofObjectstobeanalyzed.append(onemoreObject)
#print listofObjectstobeanalyzed
else :
s3ressource.delete_object(Bucket=variablebucket,Key=onemoreObject)
return listofObjectstobeanalyzed
# for a given existing object, create metadata
def createmetdata(bucketname,objectname):
s3ressource.upload_file(objectname, bucketname, objectname, ExtraArgs={"Metadata": {"metadata1":"ImageName","metadata2":"ImagePROPERTIES" ,"metadata3":"ImageCREATIONDATE"}})
# for a given existing object, add new metadata
def ADDmetadata(bucketname,objectname):
s3_object = s3ressource.get_object(Bucket=bucketname, Key=objectname)
k = s3ressource.head_object(Bucket = bucketname, Key = objectname)
m = k["Metadata"]
m["new_metadata"] = "ImageNEWMETADATA"
s3ressource.copy_object(Bucket = bucketname, Key = objectname, CopySource = bucketname + '/' + objectname, Metadata = m, MetadataDirective='REPLACE')
# for a given existing object, update a metadata with new value
def CHANGEmetadata(bucketname,objectname):
s3_object = s3ressource.get_object(Bucket=bucketname, Key=objectname)
k = s3ressource.head_object(Bucket = bucketname, Key = objectname)
m = k["Metadata"]
m.update({'watson_visual_rec_dic':'ImageCREATIONDATEEEEEEEEEEEEEEEEEEEEEEEEEE'})
s3ressource.copy_object(Bucket = bucketname, Key = objectname, CopySource = bucketname + '/' + objectname, Metadata = m, MetadataDirective='REPLACE')
def readmetadata (bucketname,objectname):
ALLDATAOFOBJECT = s3ressource.get_object(Bucket=bucketname, Key=objectname)
ALLDATAOFOBJECTMETADATA=ALLDATAOFOBJECT['Metadata']
print ALLDATAOFOBJECTMETADATA
# create the list of object on a per bucket basis
BuildObjectListPerBucket (param_4)
# Call functions to see the results
for objectitem in listofObjectstobeanalyzed:
# CALL The function you want
readmetadata(param_4,objectitem)
ADDmetadata(param_4,objectitem)
readmetadata(param_4,objectitem)
CHANGEmetadata(param_4,objectitem)
readmetadata(param_4,objectitem)
I am not able to find any solution for recusively copying contents from one to another in s3 buckets using boto in python.
suppose a bucket B1 contains has key structure like:
B1/x/*
I want to copy all the objects recursively from key like B/x/* to B/y/*
There is not "directory" in S3. Those "/" separator is just part of object name, that's why boto doesn't have such features. Either write a script to deal with it or use third party tools.
AWS customerapps show s3browser that provide such arbitrary directory copying functionality. The typical free version only spawn two threads to move file, the paid version allow you to specify more threads and run faster.
Or you just write script and use s3.client.copy_object to copy the file to another name, then delete them afterwards. e.g.
import boto3
s3 = boto3.client("s3")
# list_objects_v2() give more info
more_objects=True
found_token = True
while more_objects :
if found_token :
response= s3.list_objects_v2(
Bucket="mybucket",
Prefix="B1/x/",
Delimiter="/")
else:
response= s3.list_objects_v2(
Bucket="mybucket",
ContinuationToken=found_token,
Prefix="B1/x/",
Delimiter="/")
# use copy_object or copy_from
for source in object_list["Contents"]:
raw_name = source["Key"].split("/")[-1]
new_name = "new_structure/{}".format(raw_name)
s3.copy_object(
....
)
# Now check there is more objects to list
if "NextContinuationToken" in response:
found_token = response["NextContinuationToken"]
more_objects = True
else:
more_objects = False
** IMPORTANT NOTES ** : list_object only return maximum 1000 keys per listing, MaxKey will not change the limit. So you must use list_objects_v2 and check whether NextContinuationToken is returned, to make sure the is more object, repeat it until exhausted.
Just trying to build on previous answer:
s3 = boto3.client('s3')
def copyFolderFromS3(pathFrom, bucketTo, locationTo):
response = {}
response['status'] = 'failed'
getBucket = pathFrom.split('/')[2]
location = '/'.join(pathFrom.split('/')[3:])
if pathFrom.startswith('s3://'):
copy_source = { 'Bucket': getBucket, 'Key': location }
uploadKey = locationTo
recursiveCopyFolderToS3(copy_source,bucketTo,uploadKey)
def recursiveCopyFolderToS3(src,uplB,uplK):
more_objects=True
found_token = True
while more_objects:
if found_token:
response = s3.list_objects_v2(
Bucket=src['Bucket'],
Prefix=src['Key'],
Delimiter="/")
else:
response = s3.list_objects_v2(
Bucket=src['Bucket'],
ContinuationToken=found_token,
Prefix=src['Key'],
Delimiter="/")
for source in response["Contents"]:
raw_name = source["Key"].split("/")[-1]
raw_name = raw_name
new_name = os.path.join(uplK,raw_name)
if raw_name.endswith('_$folder$'):
src["Key"] = source["Key"].replace('_$folder$','/')
new_name = new_name.replace('_$folder$','')
recursiveCopyFolderToS3(src,uplB,new_name)
else:
src['Key'] = source["Key"]
s3.copy_object(CopySource=src,Bucket=uplB,Key=new_name)
if "NextContinuationToken" in response:
found_token = response["NextContinuationToken"]
more_objects = True
else:
more_objects = False
Or you an also use the simple awscli which is by default installed on EC2/emr machines.
import subprocess
cmd='aws s3 cp '+path+' '+uploadUrl+' --recursive'
p=subprocess.Popen(cmd, shell=True,stdout=subprocess.PIPE)
p.communicate()
Instead of using boto3, I opt for aws-cli and sh. See the aws s3 cp docs for full list of arguments, which you can include as kwargs in the following (reworked from my own code) which can be used to copy to / from / between S3 buckets and / or local targets:
import sh # also assumes aws-cli has been installed
def s3_cp(source, target, **kwargs):
"""
Copy data from source to target. Include flags as kwargs
such as recursive=True and include=xyz
"""
args = []
for flag_name, flag_value in kwargs.items():
if flag_value is not False: # i.e. --quiet=False means omit --quiet
args.append(f"--{flag_name}")
if flag_value is not True: # i.e. --quiet=True means --quiet
args.append(flag_value)
args += [source, target]
sh.aws("s3", "cp", *args)
bucket to bucket (as per the OP's question):
s3_cp("s3://B1/x/", "s3://B1/y/", quiet=True, recursive=True)
or bucket to local:
s3_cp("s3://B1/x/", "my-local-dir/", quiet=True, recursive=True)
Personally I found that this method gave improved transfer time (of a few GB over 20k small files) from a couple of hours to a few minutes compared to boto3. Perhaps under the hood it's doing some threading or simply opening few connections - but that's just speculation.
Warning: it won't work on Windows.
Related: https://stackoverflow.com/a/46680575/1571593
Another boto3 alternative, using the higher level resource API rather than client:
import os
import boto3
def copy_prefix_within_s3_bucket(
endpoint_url: str,
bucket_name: str,
old_prefix: str,
new_prefix: str,
) -> None:
bucket = boto3.resource(
"s3",
endpoint_url=endpoint_url,
aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
).Bucket(bucket_name)
for obj in bucket.objects.filter(Prefix=old_prefix):
old_key = obj.key
new_key = old_key.replace(old_prefix, new_prefix)
copy_source = {"Bucket": bucket_name, "Key": old_key}
bucket.copy(copy_source, new_key)
if __name__ == "__main__":
copy_prefix_within_s3_bucket(
endpoint_url="my_endpoint_url",
bucket_name="my_bucket_name",
old_prefix="my_old_prefix",
new_prefix="my_new_prefix",
)