How to upload folder on Google Cloud Storage using Python API - python

I have successfully uploaded single text file on Google Cloud Storage. But when i try to upload whole folder, It gives permission denied error.
filename = "d:/foldername" #here test1 is the folder.
Error:
Traceback (most recent call last):
File "test1.py", line 142, in <module>
upload()
File "test1.py", line 106, in upload
media = MediaFileUpload(filename, chunksize=CHUNKSIZE, resumable=True)
File "D:\jatin\Project\GAE_django\GCS_test\oauth2client\util.py", line 132, in positional_wrapper
return wrapped(*args, **kwargs)
File "D:\jatin\Project\GAE_django\GCS_test\apiclient\http.py", line 422, in __init__
fd = open(self._filename, 'rb')
IOError: [Errno 13] Permission denied: 'd:/foldername'

This works for me. Copy all content from a local directory to a specific bucket-name/full-path (recursive) in google cloud storage:
import glob
from google.cloud import storage
def upload_local_directory_to_gcs(local_path, bucket, gcs_path):
assert os.path.isdir(local_path)
for local_file in glob.glob(local_path + '/**'):
if not os.path.isfile(local_file):
upload_local_directory_to_gcs(local_file, bucket, gcs_path + "/" + os.path.basename(local_file))
else:
remote_path = os.path.join(gcs_path, local_file[1 + len(local_path):])
blob = bucket.blob(remote_path)
blob.upload_from_filename(local_file)
upload_local_directory_to_gcs(local_path, bucket, BUCKET_FOLDER_DIR)

A version without a recursive function, and it works with 'top level files' (unlike the top answer):
import glob
import os
from google.cloud import storage
GCS_CLIENT = storage.Client()
def upload_from_directory(directory_path: str, dest_bucket_name: str, dest_blob_name: str):
rel_paths = glob.glob(directory_path + '/**', recursive=True)
bucket = GCS_CLIENT.get_bucket(dest_bucket_name)
for local_file in rel_paths:
remote_path = f'{dest_blob_name}/{"/".join(local_file.split(os.sep)[1:])}'
if os.path.isfile(local_file):
blob = bucket.blob(remote_path)
blob.upload_from_filename(local_file)

A folder is a cataloging structure containing references to files and directories. The library will not accept a folder as an argument.
As far as I understand, your use case is to make an upload to GCS preserving a local folder structure. To accomplish that you can use the os python module and make a recursive function (e.g process_folder) that will take path as an argument. This logic can be used for the function:
Use os.listdir() method to get a list of objects within the source path (will return both files and folders).
Iterate over a list from step 1 to separate files from folders via os.path.isdir() method.
Iterate over files and upload them with adjusted path (e.g. path+ “/“ + file_name).
Iterate over folders making a recursive call (e.g. process_folder(path+folder_name)).
It’ll be necessary to work with two paths:
Real system path (e.g. “/Users/User/…/upload_folder/folder_name”) used with os module.
Virtual path for GCS file uploads (e.g. “upload”+”/“ + folder_name + ”/“ + file_name).
Don’t forget to implement exponential backoff referenced at [1] to deal with 500 errors. You can use a Drive SDK example at [2] as a reference.
[1] - https://developers.google.com/storage/docs/json_api/v1/how-tos/upload#exp-backoff
[2] - https://developers.google.com/drive/web/handle-errors

I assume the sheer filename = "D:\foldername" is not enough info about the source code. Neither am I sure that this is even possible.. via the web interface you can also just upload files or create folders where you then upload the files.
You could save the folders name, then create it (I've never used the google-app-engine, but I guess that should be possible) and then upload the contents to the new folder

Refer -
https://hackersandslackers.com/manage-files-in-google-cloud-storage-with-python/
from os import listdir
from os.path import isfile, join
...
def upload_files(bucketName):
"""Upload files to GCP bucket."""
files = [f for f in listdir(localFolder) if isfile(join(localFolder, f))]
for file in files:
localFile = localFolder + file
blob = bucket.blob(bucketFolder + file)
blob.upload_from_filename(localFile)
return f'Uploaded {files} to "{bucketName}" bucket.'

The solution can also be used for windows systems. Simply provide the folder name to upload the destination bucket name.Additionally, it can handle any level of subdirectories in a folder.
import os
from google.cloud import storage
storage_client = storage.Client()
def upload_files(bucketName, folderName):
"""Upload files to GCP bucket."""
bucket = storage_client.get_bucket(bucketName)
for path, subdirs, files in os.walk(folderName):
for name in files:
path_local = os.path.join(path, name)
blob_path = path_local.replace('\\','/')
blob = bucket.blob(blob_path)
blob.upload_from_filename(path_local)

Here is my recursive implementation . we need to create a file named gdrive_utils.py and write the following.
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
import pickle
import glob
import os
# The following scopes are required for access to google drive.
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/drive.metadata.readonly',
'https://www.googleapis.com/auth/drive.metadata',
'https://www.googleapis.com/auth/drive',
'https://www.googleapis.com/auth/drive.file',
'https://www.googleapis.com/auth/drive.appdata']
def get_gdrive_service():
"""
Tries to authenticate using a token. If token expires or not present creates one.
:return: Returns authenticated service object
:rtype: object
"""
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
creds = pickle.load(token)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
'keys/client-secret.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.pickle', 'wb') as token:
pickle.dump(creds, token)
# return Google Drive API service
return build('drive', 'v3', credentials=creds)
def createRemoteFolder(drive_service, folderName, parent_id):
# Create a folder on Drive, returns the newely created folders ID
body = {
'name': folderName,
'mimeType': "application/vnd.google-apps.folder",
'parents': [parent_id]
}
root_folder = drive_service.files().create(body = body, supportsAllDrives=True, fields='id').execute()
return root_folder['id']
def upload_file(drive_service, file_location, parent_id):
# Create a folder on Drive, returns the newely created folders ID
body = {
'name': os.path.split(file_location)[1],
'parents': [parent_id]
}
media = MediaFileUpload(file_location,
resumable=True)
file_details = drive_service.files().create(body = body,
media_body=media,
supportsAllDrives=True,
fields='id').execute()
return file_details['id']
def upload_file_recursively(g_drive_service, root, folder_id):
files_list = glob.glob(root)
if files_list:
for file_contents in files_list:
if os.path.isdir(file_contents):
# create new _folder
new_folder_id = createRemoteFolder(g_drive_service, os.path.split(file_contents)[1],
folder_id)
upload_file_recursively(g_drive_service, os.path.join(file_contents, '*'), new_folder_id)
else:
# upload to given folder id
upload_file(g_drive_service, file_contents, folder_id)
After that use the following
import os
from gdrive_utils import createRemoteFolder, upload_file_recursively, get_gdrive_service
g_drive_service = get_gdrive_service()
FOLDER_ID_FOR_UPLOAD = "<replace with folder id where you want upload>"
main_folder_id = createRemoteFolder(g_drive_service, '<name_of_main_folder>', FOLDER_ID_FOR_UPLOAD)
And finally use this
upload_file_recursively(g_drive_service, os.path.join("<your_path_>", '*'), main_folder_id)

I just came across the gcsfs library which seems to be also about better interfaces
You could copy an entire directory into a gcs location like this:
def upload_to_gcs(src_dir: str, gcs_dst: str):
fs = gcsfs.GCSFileSystem()
fs.put(src_dir, gcs_dst, recursive=True)

Another option is to use gsutils, the command-line tool for interacting with Google Cloud:
gsutil cp -r ./my/local/directory gs://my_gcp_bucket/foo/bar
The -r flag tells gsutils to copy recursively. Link gsutils to documentation.
Invoking gsutils in Python can be done like this:
import subprocess
subprocess.check_call('gsutil cp -r ./my/local/directory gs://my_gcp_bucket/foo/bar')

Related

Python: How to upload folder to Google Drive

i want to upload a local folder to google drive with python
Folder example
1 folder level
C:\Users\test\Documents\google drvie\test\
the folder you want to upload
Folder name: upload
C:\Users\test\Documents\google drvie\test\upload
* In the upload folder
There is still a hierarchy of folders and files.
I want to upload all folders and files in upload.
C:\Users\test\Documents\google drvie\test\upload\upload2
C:\Users\test\Documents\google drvie\test\upload\test.txt
C:\Users\test\Documents\google drvie\test\upload\upload2\uplpad3
C:\Users\test\Documents\google drvie\test\upload\upload2\uplpad3\test.txt
I did it with the script below, but I can't upload by folder.
Only files can be uploaded.
When uploading by folder, the following access permission error is displayed.
If anyone knows, I would appreciate it if you could tell me.
error contents
Traceback
GoogleDriveFile({'parents': [{'id': '1J8TXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'}], 'title': 'upload'})
Traceback (most recent call last):
File "c:\Users\test\Documents\google drvie\googledrive_file_up.py", line 43, in <module>
f.SetContentFile(os.path.join(path,x))
File "C:\Users\test\AppData\Roaming\Python\Python39\site-packages\pydrive\files.py", line 169, in SetContentFile
self.content = open(filename, 'rb')
PermissionError: [Errno 13] Permission denied: 'C:\\Users\\test\\Documents\\google drvie\\test\\upload'
Reference page
How to Upload File to Google Drive using Python Script?
code
from pydrive.drive import GoogleDrive
from pydrive.auth import GoogleAuth
import os
#Authenticate Google services
gauth = GoogleAuth()
# load credentials or create empty credentials if none exist
gauth.LoadCredentialsFile("mycreds.txt")
#If you don't have Google service credentials
if gauth.credentials is None:
#Automatically receive authorization code from user and configure local web server
gauth. LocalWebserverAuth()
# if the access token does not exist or has expired
elif gauth.access_token_expired:
#refresh authorization for google services
gauth. Refresh()
# if none match
else:
#Authorize Google services
gauth. Authorize()
# save credentials to file in txt format
gauth.SaveCredentialsFile("mycreds.txt")
#Authentication process for Google Drive
drive = GoogleDrive(gauth)
# specify folder path to upload
path = r'C:\Users\test\Documents\google drvie\test'
# File ID to upload to GOOGLE DRIVE
folder_id='1J8TXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
Loop processing (repeated processing) by #for statement
for x in os.listdir(path):
# Create GoogleDriveFile object
#f = drive.CreateFile({'title' : x})
f = drive.CreateFile({"parents": [{"id": folder_id},]})
#file title
f['title'] = x
# set local file and upload
print(f)
f.SetContentFile(os.path.join(path,x))
print(f)
#Upload to Google Drive
f.Upload()
print(f)
f = None
Sorry for the inconvenience, but thank you in advance.

Automating running Python code using Azure services

Hi everyone on Stackoverflow,
I wrote two python scripts. One script is for picking up local files and sending them to GCS (Google Cloud Storage). Another one is opposite - for taking files from GCS that were uploaded and saving locally.
I want to automate process using Azure.
What would you recommend to use? Azure Function App, Azure Logic App or other services?
*
I'm now trying to use Logic App. I made .exe file using pyinstaller and looking for connector in Logic App that will run my program (.exe file). I have trigger in Logic App - "When a file is added or modified", but now I stack when selecting next step (connector)..
Kind regards,
Anna
Adding code as requested:
from google.cloud import storage
import os
import glob
import json
# Finding path to config file that is called "gcs_config.json" in directory C:/
def find_config(name, path):
for root, dirs, files in os.walk(path):
if name in files:
return os.path.join(root, name)
def upload_files(config_file):
# Reading 3 Parameters for upload from JSON file
with open(config_file, "r") as file:
contents = json.loads(file.read())
print(contents)
# Setting up login credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = contents['login_credentials']
# The ID of GCS bucket
bucket_name = contents['bucket_name']
# Setting path to files
LOCAL_PATH = contents['folder_from']
for source_file_name in glob.glob(LOCAL_PATH + '/**'):
# For multiple files upload
# Setting destination folder according to file name
if os.path.isfile(source_file_name):
partitioned_file_name = os.path.split(source_file_name)[-1].partition("-")
file_type_name = partitioned_file_name[0]
# Setting folder where files will be uploaded
destination_blob_name = file_type_name + "/" + os.path.split(source_file_name)[-1]
# Setting up required variables for GCS
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
# Running upload and printing confirmation message
blob.upload_from_filename(source_file_name)
print("File from {} uploaded to {} in bucket {}.".format(
source_file_name, destination_blob_name, bucket_name
))
config_file = find_config("gcs_config.json", "C:/")
upload_files(config_file)
config.json:
{
"login_credentials": "C:/Users/AS/Downloads/bright-velocity-___-53840b2f9bb4.json",
"bucket_name": "staging.bright-velocity-___.appspot.com",
"folder_from": "C:/Users/AS/Documents/Test2/",
"folder_for_downloaded_files": "C:/Users/AnnaShepilova/Documents/DownloadedFromGCS2/",
"given_date": "",
"given_prefix": ["Customer", "Account"] }
Currently, there is no built-in connector in Logic Apps for interacting with Google Cloud Services. however, you can use Google Cloud Storage does provide REST API in your Logic app or Function app.
But my suggestion is you can use the Azure Function to do these things. Because the azure Function can be more flexible to write your own flow to do the task.
Refer to run your .exe file in the Azure function. If you are using Local EXE or using Cloud Environment exe.
Refer here for more information

Python Google Drive API Slow Multi-File Upload

I'm trying to upload a directory and all of its contents to Google Drive. I can accomplish this in Python fine and the files upload, but it goes 1 file at a time with a request to the API with each file and its very slow. I'm practicing with a small directory for now, but when I have 2000 files in the future it will take for-ev-er. Is there a faster way I can accomplish it, maybe with a single request instead of a request for each file?
Thanks
Here is my main program:
# user wants to upload to Google Drive HOME-SYNC
print("4: upload to HOME-SYNC on Google Drive")
# assuming HOME-SYNC is empty, for now first step is copying directory
# structure on local machine to HOME-SYNC
# in the future need to ask if want to backup HOME-SYNC first, and if
# so back it up
# then need to empty it
# specify the start path
start_path = "/home/geoff/HOME-SYNC"
start_path = ff.abs_path_from_user_input(start_path)
print("START PATH")
print(start_path)
# create a directory object with start path
start_directory = Directory(start_path)
# create a google drive service resource
google_service = ff.create_google_token()
# create the directory tree on google drive
# '1YOTDKowprC2Paq95X-MIKSUG_vpuViQw' is the id of HOME-SYNC on
# Google Drive
start_directory.create_google_drive_tree(
'HOME-SYNC',
google_service,
'1YOTDKowprC2Paq95X-MIKSUG_vpuViQw')
print("FINISHED")
Here is my Directory Class:
class Directory():
def __init__(self, directory_path):
"""Initialize directory"""
self.directory_path = directory_path
#print("__INIT__ DIR PATH=" + self.directory_path)
def create_google_drive_tree(
self,
google_drive_folder="",
google_service=False,
parent_dir_id=''):
"""Creates the same tree in google drive that is in the Directory
object, with 'google_drive_folder' as the ROOT directory
(== Directory obj)"""
# google_drive_folder = name of the current directory
# google_service = Google API resource
# parent_dir_id = id of the parent dir on Google drive
# create the files_and_dirs list in the current directory
files_and_dirs = \
[files_and_dirs for files_and_dirs in listdir(self.directory_path)]
print(files_and_dirs)
# sorts the files and dirs so their alphabetical and files come first
files_and_dirs = \
ff.sort_files_and_dirs(self.directory_path, files_and_dirs)
# loop through files and directories, outputting if its a file or dir
# if its a dir and full_tree==true, make a recursive call by creating
# new Directory instance then listing the contents of that as well
for fd in files_and_dirs:
abs_path = ff.abs_path_from_local_dir(self.directory_path, fd)
if ff.check_file_or_dir(abs_path) == "file":
# its a file
# need to copy the file to Google Drive
file_metadata = {
'name': fd,
'parents': [parent_dir_id]
}
media = MediaFileUpload(abs_path)
file = google_service.files().create(body=file_metadata,
media_body=media,
fields='id').execute()
else:
# its a directory
# create the directory in google drive
file_metadata = {
'name': fd,
'mimeType': 'application/vnd.google-apps.folder',
'parents': [parent_dir_id]
}
file = google_service.files().create(body=file_metadata,
fields='id').execute()
# create a new Directory obj with the current Directory
# which is a subdirectory of the current Directory
sub_dir = Directory(abs_path)
# Recursively build tree inside the subdirectory
sub_dir.create_google_drive_tree(
fd,
google_service,
file.get('id'))
and I have utility functions in file_functions.py
def abs_path_from_user_input(start_path):
if start_path[:1] == '/':
path_type = "absolute"
else:
path_type = "relative"
if path_type != "absolute":
start_path = realpath(start_path)
return start_path
def abs_path_from_local_dir(directory, content):
abs_path = realpath(join(directory, content))
return abs_path
def sort_files_and_dirs(curr_path, files_and_dirs):
files = []
dirs = []
for file_dir in files_and_dirs:
abs_path = abs_path_from_local_dir(curr_path, file_dir)
if check_file_or_dir(abs_path) == "file":
files.append(file_dir)
else:
dirs.append(file_dir)
files.sort()
dirs.sort()
combined = []
for f in files:
combined.append(f)
for d in dirs:
combined.append(d)
return combined
def check_file_or_dir(path):
if not exists(path):
print("ERROR: PATH IS NOT VALID: " + path)
return False;
else:
if isfile(path):
return "file"
else:
return "dir"
def is_valid_dir(path):
if exists(path):
# the path is a valid path
if not isfile(path):
# its a valid directory
return True
else:
# its a valid file, but we want directories
return False
else:
# the path doesnt exist
return False
def create_google_token():
store = file.Storage('credentials.json')
creds = store.get()
if not creds or creds.invalid:
flow = client.flow_from_clientsecrets('client_secret.json', SCOPES)
creds = tools.run_flow(flow, store)
# service resource is the connection to google drive
service = build('drive', 'v3', http=creds.authorize(Http()))
return service
1 file at a time with a request to the API with each file and its very slow.
Uploading does take time.
Is there a faster way I can accomplish it, maybe with a single request instead of a request for each file?
There is no batch method for uploading files. you will need to upload your files one at a time as you are doing now. Remember there is a quota limit on this your only going to be able upload so fast. You could consider multi threading this and running a version of your script for each of the files to upload. However i wouldn't advise this as its going to be the same user and your going to end up having quota and flooding errors.
Note: you can batch the metadata upload but thats really not going to solve your problem batching request.

Check local files vs what's on S3, and upload any that have changed

I am using this python script to upload files that have changed or newly created from local folder to S3 folder.
The script does not work. It just failed at getting bucket name. I am using boto with python2.7. I have googled but couldnt get the answer.
Any help much appreciated.
Many thanks.
Here is the error
Traceback (most recent call last):
File "s3update.py", line 20, in <module>
bucket = conn.get_bucket(BUCKET_NAME)
File "/usr/lib/python2.7/site-packages/boto/s3/connection.py", line 506, in get_bucket
return self.head_bucket(bucket_name, headers=headers)
File "/usr/lib/python2.7/site-packages/boto/s3/connection.py", line 539, in head_bucket
raise err
boto.exception.S3ResponseError: S3ResponseError: 403 Forbidden
Here is the code
#!/usr/bin/env python
# Compare a file on S3 to see if we have the latest version
# If not, upload it and invalidate CloudFront
import fnmatch
import os
import boto
import pprint
import re
import hashlib
from boto.s3.key import Key
# Where source is checked out
SOURCE_DIR = '/Downloads/local/folder'
BUCKET_NAME = 's3bucket'
# Connect to S3 and get bucket
conn = boto.connect_s3()
bucket = conn.get_bucket('s3bucket')
# Shortcut to MD5
def get_md5(filename):
f = open(filename, 'rb')
m = hashlib.md5()
while True:
data = f.read(10240)
if len(data) == 0:
break
m.update(data)
return m.hexdigest()
def to_uri(filename):
return re.sub(SOURCE_DIR, '', f)
# Assemble a list of all files from SOURCE_DIR
files = []
for root, dirnames, filenames in os.walk(SOURCE_DIR):
for filename in filenames:
files.append(os.path.join(root, filename))
# Compare them to S3 checksums
files_to_upload = []
for f in files:
uri = to_uri(f)
key = bucket.get_key(uri)
if key is None:
# new file, upload
files_to_upload.append(f)
else:
# check MD5
md5 = get_md5(f)
etag = key.etag.strip('"').strip("'")
if etag != md5:
print(f + ": " + md5 + " != " + etag)
files_to_upload.append(f)
# Upload + invalidate the ones that are different
for f in files_to_upload:
uri = to_uri(f)
key = Key(bucket)
key.key = uri
key.set_contents_from_filename(f)
# CloudFront invalidation code goes here
You should make sure your bucket name is correct and exists as well as that you have permission to access it.
I had an error similar to this and it was because while my user permissions did allow all access to anything inside the bucket, it did not allow actions at the bucket level, like listing files / objects in it.
For me on the bucket permissions tab after running the policy generator it originally read {
...
"Action": "s3:*",
"Resource": "arn:aws:s3:::py-sync-s3/*"
]
...
And I had to change it too:
...
"Action": "s3:*",
"Resource": [
"arn:aws:s3:::py-sync-s3",
"arn:aws:s3:::py-sync-s3/*"
]
...
py-sync-s3 being the bucket name.
I know for actions Ideally you would not want to allow all actions with * this was just for testing.

Boto3 to download all files from a S3 Bucket

I'm using boto3 to get files from s3 bucket. I need a similar functionality like aws s3 sync
My current code is
#!/usr/bin/python
import boto3
s3=boto3.client('s3')
list=s3.list_objects(Bucket='my_bucket_name')['Contents']
for key in list:
s3.download_file('my_bucket_name', key['Key'], key['Key'])
This is working fine, as long as the bucket has only files.
If a folder is present inside the bucket, its throwing an error
Traceback (most recent call last):
File "./test", line 6, in <module>
s3.download_file('my_bucket_name', key['Key'], key['Key'])
File "/usr/local/lib/python2.7/dist-packages/boto3/s3/inject.py", line 58, in download_file
extra_args=ExtraArgs, callback=Callback)
File "/usr/local/lib/python2.7/dist-packages/boto3/s3/transfer.py", line 651, in download_file
extra_args, callback)
File "/usr/local/lib/python2.7/dist-packages/boto3/s3/transfer.py", line 666, in _download_file
self._get_object(bucket, key, filename, extra_args, callback)
File "/usr/local/lib/python2.7/dist-packages/boto3/s3/transfer.py", line 690, in _get_object
extra_args, callback)
File "/usr/local/lib/python2.7/dist-packages/boto3/s3/transfer.py", line 707, in _do_get_object
with self._osutil.open(filename, 'wb') as f:
File "/usr/local/lib/python2.7/dist-packages/boto3/s3/transfer.py", line 323, in open
return open(filename, mode)
IOError: [Errno 2] No such file or directory: 'my_folder/.8Df54234'
Is this a proper way to download a complete s3 bucket using boto3. How to download folders.
I have the same needs and created the following function that download recursively the files.
The directories are created locally only if they contain files.
import boto3
import os
def download_dir(client, resource, dist, local='/tmp', bucket='your_bucket'):
paginator = client.get_paginator('list_objects')
for result in paginator.paginate(Bucket=bucket, Delimiter='/', Prefix=dist):
if result.get('CommonPrefixes') is not None:
for subdir in result.get('CommonPrefixes'):
download_dir(client, resource, subdir.get('Prefix'), local, bucket)
for file in result.get('Contents', []):
dest_pathname = os.path.join(local, file.get('Key'))
if not os.path.exists(os.path.dirname(dest_pathname)):
os.makedirs(os.path.dirname(dest_pathname))
if not file.get('Key').endswith('/'):
resource.meta.client.download_file(bucket, file.get('Key'), dest_pathname)
The function is called that way:
def _start():
client = boto3.client('s3')
resource = boto3.resource('s3')
download_dir(client, resource, 'clientconf/', '/tmp', bucket='my-bucket')
When working with buckets that have 1000+ objects its necessary to implement a solution that uses the NextContinuationToken on sequential sets of, at most, 1000 keys. This solution first compiles a list of objects then iteratively creates the specified directories and downloads the existing objects.
import boto3
import os
s3_client = boto3.client('s3')
def download_dir(prefix, local, bucket, client=s3_client):
"""
params:
- prefix: pattern to match in s3
- local: local path to folder in which to place files
- bucket: s3 bucket with target contents
- client: initialized s3 client object
"""
keys = []
dirs = []
next_token = ''
base_kwargs = {
'Bucket':bucket,
'Prefix':prefix,
}
while next_token is not None:
kwargs = base_kwargs.copy()
if next_token != '':
kwargs.update({'ContinuationToken': next_token})
results = client.list_objects_v2(**kwargs)
contents = results.get('Contents')
for i in contents:
k = i.get('Key')
if k[-1] != '/':
keys.append(k)
else:
dirs.append(k)
next_token = results.get('NextContinuationToken')
for d in dirs:
dest_pathname = os.path.join(local, d)
if not os.path.exists(os.path.dirname(dest_pathname)):
os.makedirs(os.path.dirname(dest_pathname))
for k in keys:
dest_pathname = os.path.join(local, k)
if not os.path.exists(os.path.dirname(dest_pathname)):
os.makedirs(os.path.dirname(dest_pathname))
client.download_file(bucket, k, dest_pathname)
import os
import boto3
#initiate s3 resource
s3 = boto3.resource('s3')
# select bucket
my_bucket = s3.Bucket('my_bucket_name')
# download file into current directory
for s3_object in my_bucket.objects.all():
# Need to split s3_object.key into path and file name, else it will give error file not found.
path, filename = os.path.split(s3_object.key)
my_bucket.download_file(s3_object.key, filename)
Amazon S3 does not have folders/directories. It is a flat file structure.
To maintain the appearance of directories, path names are stored as part of the object Key (filename). For example:
images/foo.jpg
In this case, the whole Key is images/foo.jpg, rather than just foo.jpg.
I suspect that your problem is that boto is returning a file called my_folder/.8Df54234 and is attempting to save it to the local filesystem. However, your local filesystem interprets the my_folder/ portion as a directory name, and that directory does not exist on your local filesystem.
You could either truncate the filename to only save the .8Df54234 portion, or you would have to create the necessary directories before writing files. Note that it could be multi-level nested directories.
An easier way would be to use the AWS Command-Line Interface (CLI), which will do all this work for you, eg:
aws s3 cp --recursive s3://my_bucket_name local_folder
There's also a sync option that will only copy new and modified files.
I'm currently achieving the task, by using the following
#!/usr/bin/python
import boto3
s3=boto3.client('s3')
list=s3.list_objects(Bucket='bucket')['Contents']
for s3_key in list:
s3_object = s3_key['Key']
if not s3_object.endswith("/"):
s3.download_file('bucket', s3_object, s3_object)
else:
import os
if not os.path.exists(s3_object):
os.makedirs(s3_object)
Although, it does the job, I'm not sure its good to do this way.
I'm leaving it here to help other users and further answers, with better manner of achieving this
Better late than never:) The previous answer with paginator is really good. However it is recursive, and you might end up hitting Python's recursion limits. Here's an alternate approach, with a couple of extra checks.
import os
import errno
import boto3
def assert_dir_exists(path):
"""
Checks if directory tree in path exists. If not it created them.
:param path: the path to check if it exists
"""
try:
os.makedirs(path)
except OSError as e:
if e.errno != errno.EEXIST:
raise
def download_dir(client, bucket, path, target):
"""
Downloads recursively the given S3 path to the target directory.
:param client: S3 client to use.
:param bucket: the name of the bucket to download from
:param path: The S3 directory to download.
:param target: the local directory to download the files to.
"""
# Handle missing / at end of prefix
if not path.endswith('/'):
path += '/'
paginator = client.get_paginator('list_objects_v2')
for result in paginator.paginate(Bucket=bucket, Prefix=path):
# Download each file individually
for key in result['Contents']:
# Calculate relative path
rel_path = key['Key'][len(path):]
# Skip paths ending in /
if not key['Key'].endswith('/'):
local_file_path = os.path.join(target, rel_path)
# Make sure directories exist
local_file_dir = os.path.dirname(local_file_path)
assert_dir_exists(local_file_dir)
client.download_file(bucket, key['Key'], local_file_path)
client = boto3.client('s3')
download_dir(client, 'bucket-name', 'path/to/data', 'downloads')
A lot of the solutions here get pretty complicated. If you're looking for something simpler, cloudpathlib wraps things in a nice way for this use case that will download directories or files.
from cloudpathlib import CloudPath
cp = CloudPath("s3://bucket/product/myproject/2021-02-15/")
cp.download_to("local_folder")
Note: for large folders with lots of files, awscli at the command line is likely faster.
I have a workaround for this that runs the AWS CLI in the same process.
Install awscli as python lib:
pip install awscli
Then define this function:
from awscli.clidriver import create_clidriver
def aws_cli(*cmd):
old_env = dict(os.environ)
try:
# Environment
env = os.environ.copy()
env['LC_CTYPE'] = u'en_US.UTF'
os.environ.update(env)
# Run awscli in the same process
exit_code = create_clidriver().main(*cmd)
# Deal with problems
if exit_code > 0:
raise RuntimeError('AWS CLI exited with code {}'.format(exit_code))
finally:
os.environ.clear()
os.environ.update(old_env)
To execute:
aws_cli('s3', 'sync', '/path/to/source', 's3://bucket/destination', '--delete')
import boto3, os
s3 = boto3.client('s3')
def download_bucket(bucket):
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket)
for page in pages:
if 'Contents' in page:
for obj in page['Contents']:
os.path.dirname(obj['Key']) and os.makedirs(os.path.dirname(obj['Key']), exist_ok=True)
try:
s3.download_file(bucket, obj['Key'], obj['Key'])
except NotADirectoryError:
pass
# Change bucket_name to name of bucket that you want to download
download_bucket(bucket_name)
This should work for all number of objects (also when there are more than 1000). Each paginator page can contain up to 1000 objects.Notice extra param in os.makedirs function - exist_ok=True which cause that it's not throwing error when path exist)
I've updated Grant's answer to run in parallel, it's much faster if anyone is interested:
from concurrent import futures
import os
import boto3
def download_dir(prefix, local, bucket):
client = boto3.client('s3')
def create_folder_and_download_file(k):
dest_pathname = os.path.join(local, k)
if not os.path.exists(os.path.dirname(dest_pathname)):
os.makedirs(os.path.dirname(dest_pathname))
print(f'downloading {k} to {dest_pathname}')
client.download_file(bucket, k, dest_pathname)
keys = []
dirs = []
next_token = ''
base_kwargs = {
'Bucket': bucket,
'Prefix': prefix,
}
while next_token is not None:
kwargs = base_kwargs.copy()
if next_token != '':
kwargs.update({'ContinuationToken': next_token})
results = client.list_objects_v2(**kwargs)
contents = results.get('Contents')
for i in contents:
k = i.get('Key')
if k[-1] != '/':
keys.append(k)
else:
dirs.append(k)
next_token = results.get('NextContinuationToken')
for d in dirs:
dest_pathname = os.path.join(local, d)
if not os.path.exists(os.path.dirname(dest_pathname)):
os.makedirs(os.path.dirname(dest_pathname))
with futures.ThreadPoolExecutor() as executor:
futures.wait(
[executor.submit(create_folder_and_download_file, k) for k in keys],
return_when=futures.FIRST_EXCEPTION,
)
Yet another parallel downloader using asyncio/aioboto
import os, time
import asyncio
from itertools import chain
import json
from typing import List
from json.decoder import WHITESPACE
import logging
from functools import partial
from pprint import pprint as pp
# Third Party
import asyncpool
import aiobotocore.session
import aiobotocore.config
_NUM_WORKERS = 50
bucket_name= 'test-data'
bucket_prefix= 'etl2/test/20210330/f_api'
async def save_to_file(s3_client, bucket: str, key: str):
response = await s3_client.get_object(Bucket=bucket, Key=key)
async with response['Body'] as stream:
content = await stream.read()
if 1:
fn =f'out/downloaded/{bucket_name}/{key}'
dn= os.path.dirname(fn)
if not isdir(dn):
os.makedirs(dn,exist_ok=True)
if 1:
with open(fn, 'wb') as fh:
fh.write(content)
print(f'Downloaded to: {fn}')
return [0]
async def go(bucket: str, prefix: str) -> List[dict]:
"""
Returns list of dicts of object contents
:param bucket: s3 bucket
:param prefix: s3 bucket prefix
:return: list of download statuses
"""
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
session = aiobotocore.session.AioSession()
config = aiobotocore.config.AioConfig(max_pool_connections=_NUM_WORKERS)
contents = []
async with session.create_client('s3', config=config) as client:
worker_co = partial(save_to_file, client, bucket)
async with asyncpool.AsyncPool(None, _NUM_WORKERS, 's3_work_queue', logger, worker_co,
return_futures=True, raise_on_join=True, log_every_n=10) as work_pool:
# list s3 objects using paginator
paginator = client.get_paginator('list_objects')
async for result in paginator.paginate(Bucket=bucket, Prefix=prefix):
for c in result.get('Contents', []):
contents.append(await work_pool.push(c['Key'], client))
# retrieve results from futures
contents = [c.result() for c in contents]
return list(chain.from_iterable(contents))
def S3_download_bucket_files():
s = time.perf_counter()
_loop = asyncio.get_event_loop()
_result = _loop.run_until_complete(go(bucket_name, bucket_prefix))
assert sum(_result)==0, _result
print(_result)
elapsed = time.perf_counter() - s
print(f"{__file__} executed in {elapsed:0.2f} seconds.")
It will fetch list of files from S3 first and then download using aioboto with _NUM_WORKERS=50 reading data in parallel from the network.
If you want to call a bash script using python, here is a simple method to load a file from a folder in S3 bucket to a local folder (in a Linux machine) :
import boto3
import subprocess
import os
###TOEDIT###
my_bucket_name = "your_my_bucket_name"
bucket_folder_name = "your_bucket_folder_name"
local_folder_path = "your_local_folder_path"
###TOEDIT###
# 1.Load thes list of files existing in the bucket folder
FILES_NAMES = []
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('{}'.format(my_bucket_name))
for object_summary in my_bucket.objects.filter(Prefix="{}/".format(bucket_folder_name)):
# print(object_summary.key)
FILES_NAMES.append(object_summary.key)
# 2.List only new files that do not exist in local folder (to not copy everything!)
new_filenames = list(set(FILES_NAMES )-set(os.listdir(local_folder_path)))
# 3.Time to load files in your destination folder
for new_filename in new_filenames:
upload_S3files_CMD = """aws s3 cp s3://{}/{}/{} {}""".format(my_bucket_name,bucket_folder_name,new_filename ,local_folder_path)
subprocess_call = subprocess.call([upload_S3files_CMD], shell=True)
if subprocess_call != 0:
print("ALERT: loading files not working correctly, please re-check new loaded files")
From AWS S3 Docs (How do I use folders in an S3 bucket?):
In Amazon S3, buckets and objects are the primary resources, and objects are stored in buckets. Amazon S3 has a flat structure instead of a hierarchy like you would see in a file system. However, for the sake of organizational simplicity, the Amazon S3 console supports the folder concept as a means of grouping objects. Amazon S3 does this by using a shared name prefix for objects (that is, objects have names that begin with a common string). Object names are also referred to as key names.
For example, you can create a folder on the console named photos and store an object named myphoto.jpg in it. The object is then stored with the key name photos/myphoto.jpg, where photos/ is the prefix.
To download all files from "mybucket" into the current directory respecting the bucket's emulated directory structure (creating the folders from the bucket if they don't already exist locally):
import boto3
import os
bucket_name = "mybucket"
s3 = boto3.client("s3")
objects = s3.list_objects(Bucket = bucket_name)["Contents"]
for s3_object in objects:
s3_key = s3_object["Key"]
path, filename = os.path.split(s3_key)
if len(path) != 0 and not os.path.exists(path):
os.makedirs(path)
if not s3_key.endswith("/"):
download_to = path + '/' + filename if path else filename
s3.download_file(bucket_name, s3_key, download_to)
It is a very bad idea to get all files in one go, you should rather get it in batches.
One implementation which I use to fetch a particular folder (directory) from S3 is,
def get_directory(directory_path, download_path, exclude_file_names):
# prepare session
session = Session(aws_access_key_id, aws_secret_access_key, region_name)
# get instances for resource and bucket
resource = session.resource('s3')
bucket = resource.Bucket(bucket_name)
for s3_key in self.client.list_objects(Bucket=self.bucket_name, Prefix=directory_path)['Contents']:
s3_object = s3_key['Key']
if s3_object not in exclude_file_names:
bucket.download_file(file_path, download_path + str(s3_object.split('/')[-1])
and still if you want to get the whole bucket use it via CLI as #John Rotenstein mentioned as below,
aws s3 cp --recursive s3://bucket_name download_path
for objs in my_bucket.objects.all():
print(objs.key)
path='/tmp/'+os.sep.join(objs.key.split(os.sep)[:-1])
try:
if not os.path.exists(path):
os.makedirs(path)
my_bucket.download_file(objs.key, '/tmp/'+objs.key)
except FileExistsError as fe:
print(objs.key+' exists')
This code will download the content in /tmp/ directory. If you want you can change the directory.
I got the similar requirement and got help from reading few of the above solutions and across other websites, I have came up with below script, Just wanted to share if it might help anyone.
from boto3.session import Session
import os
def sync_s3_folder(access_key_id,secret_access_key,bucket_name,folder,destination_path):
session = Session(aws_access_key_id=access_key_id,aws_secret_access_key=secret_access_key)
s3 = session.resource('s3')
your_bucket = s3.Bucket(bucket_name)
for s3_file in your_bucket.objects.all():
if folder in s3_file.key:
file=os.path.join(destination_path,s3_file.key.replace('/','\\'))
if not os.path.exists(os.path.dirname(file)):
os.makedirs(os.path.dirname(file))
your_bucket.download_file(s3_file.key,file)
sync_s3_folder(access_key_id,secret_access_key,bucket_name,folder,destination_path)
Reposting #glefait 's answer with an if condition at the end to avoid os error 20. The first key it gets is the folder name itself which cannot be written in the destination path.
def download_dir(client, resource, dist, local='/tmp', bucket='your_bucket'):
paginator = client.get_paginator('list_objects')
for result in paginator.paginate(Bucket=bucket, Delimiter='/', Prefix=dist):
if result.get('CommonPrefixes') is not None:
for subdir in result.get('CommonPrefixes'):
download_dir(client, resource, subdir.get('Prefix'), local, bucket)
for file in result.get('Contents', []):
print("Content: ",result)
dest_pathname = os.path.join(local, file.get('Key'))
print("Dest path: ",dest_pathname)
if not os.path.exists(os.path.dirname(dest_pathname)):
print("here last if")
os.makedirs(os.path.dirname(dest_pathname))
print("else file key: ", file.get('Key'))
if not file.get('Key') == dist:
print("Key not equal? ",file.get('Key'))
resource.meta.client.download_file(bucket, file.get('Key'), dest_pathname)enter code here
I have been running into this problem for a while and with all of the different forums I've been through I haven't see a full end-to-end snip-it of what works. So, I went ahead and took all the pieces (add some stuff on my own) and have created a full end-to-end S3 Downloader!
This will not only download files automatically but if the S3 files are in subdirectories, it will create them on the local storage. In my application's instance, I need to set permissions and owners so I have added that too (can be comment out if not needed).
This has been tested and works in a Docker environment (K8) but I have added the environmental variables in the script just in case you want to test/run it locally.
I hope this helps someone out in their quest of finding S3 Download automation. I also welcome any advice, info, etc. on how this can be better optimized if needed.
#!/usr/bin/python3
import gc
import logging
import os
import signal
import sys
import time
from datetime import datetime
import boto
from boto.exception import S3ResponseError
from pythonjsonlogger import jsonlogger
formatter = jsonlogger.JsonFormatter('%(message)%(levelname)%(name)%(asctime)%(filename)%(lineno)%(funcName)')
json_handler_out = logging.StreamHandler()
json_handler_out.setFormatter(formatter)
#Manual Testing Variables If Needed
#os.environ["DOWNLOAD_LOCATION_PATH"] = "some_path"
#os.environ["BUCKET_NAME"] = "some_bucket"
#os.environ["AWS_ACCESS_KEY"] = "some_access_key"
#os.environ["AWS_SECRET_KEY"] = "some_secret"
#os.environ["LOG_LEVEL_SELECTOR"] = "DEBUG, INFO, or ERROR"
#Setting Log Level Test
logger = logging.getLogger('json')
logger.addHandler(json_handler_out)
logger_levels = {
'ERROR' : logging.ERROR,
'INFO' : logging.INFO,
'DEBUG' : logging.DEBUG
}
logger_level_selector = os.environ["LOG_LEVEL_SELECTOR"]
logger.setLevel(logger_level_selector)
#Getting Date/Time
now = datetime.now()
logger.info("Current date and time : ")
logger.info(now.strftime("%Y-%m-%d %H:%M:%S"))
#Establishing S3 Variables and Download Location
download_location_path = os.environ["DOWNLOAD_LOCATION_PATH"]
bucket_name = os.environ["BUCKET_NAME"]
aws_access_key_id = os.environ["AWS_ACCESS_KEY"]
aws_access_secret_key = os.environ["AWS_SECRET_KEY"]
logger.debug("Bucket: %s" % bucket_name)
logger.debug("Key: %s" % aws_access_key_id)
logger.debug("Secret: %s" % aws_access_secret_key)
logger.debug("Download location path: %s" % download_location_path)
#Creating Download Directory
if not os.path.exists(download_location_path):
logger.info("Making download directory")
os.makedirs(download_location_path)
#Signal Hooks are fun
class GracefulKiller:
kill_now = False
def __init__(self):
signal.signal(signal.SIGINT, self.exit_gracefully)
signal.signal(signal.SIGTERM, self.exit_gracefully)
def exit_gracefully(self, signum, frame):
self.kill_now = True
#Downloading from S3 Bucket
def download_s3_bucket():
conn = boto.connect_s3(aws_access_key_id, aws_access_secret_key)
logger.debug("Connection established: ")
bucket = conn.get_bucket(bucket_name)
logger.debug("Bucket: %s" % str(bucket))
bucket_list = bucket.list()
# logger.info("Number of items to download: {0}".format(len(bucket_list)))
for s3_item in bucket_list:
key_string = str(s3_item.key)
logger.debug("S3 Bucket Item to download: %s" % key_string)
s3_path = download_location_path + "/" + key_string
logger.debug("Downloading to: %s" % s3_path)
local_dir = os.path.dirname(s3_path)
if not os.path.exists(local_dir):
logger.info("Local directory doesn't exist, creating it... %s" % local_dir)
os.makedirs(local_dir)
logger.info("Updating local directory permissions to %s" % local_dir)
#Comment or Uncomment Permissions based on Local Usage
os.chmod(local_dir, 0o775)
os.chown(local_dir, 60001, 60001)
logger.debug("Local directory for download: %s" % local_dir)
try:
logger.info("Downloading File: %s" % key_string)
s3_item.get_contents_to_filename(s3_path)
logger.info("Successfully downloaded File: %s" % s3_path)
#Updating Permissions
logger.info("Updating Permissions for %s" % str(s3_path))
#Comment or Uncomment Permissions based on Local Usage
os.chmod(s3_path, 0o664)
os.chown(s3_path, 60001, 60001)
except (OSError, S3ResponseError) as e:
logger.error("Fatal error in s3_item.get_contents_to_filename", exc_info=True)
# logger.error("Exception in file download from S3: {}".format(e))
continue
logger.info("Deleting %s from S3 Bucket" % str(s3_item.key))
s3_item.delete()
def main():
killer = GracefulKiller()
while not killer.kill_now:
logger.info("Checking for new files on S3 to download...")
download_s3_bucket()
logger.info("Done checking for new files, will check in 120s...")
gc.collect()
sys.stdout.flush()
time.sleep(120)
if __name__ == '__main__':
main()
There are very minor differences in the way S3 organizes files and the way Windows does.
Here is a simple self-documenting example that accounts for those differences.
Also: Think of amazon file names as a normal string. They don't really represent a folder. Amazon SIMULATES folders, so if you try to just shove a file into a NAME of a folder that doesn't exist on your system, it cannot figure out where to place it. So you must MAKE a folder on your system for each simulated folder from S3. If you have a folder within a folder, don't use "mkdir(path)" it won't work. You have to use "makedirs(path)". ANOTHER THING! -> PC file paths are weirdly formatted. Amazon uses "/" and pc uses "\" and it MUST be the same for the whole file name. Check out my code block below (WHICH SHOWS AUTHENTICATION TOO).
In my example, I have a folder in my bucket called "iTovenGUIImages/gui_media". I want to put it in a folder on my system that MAY not exist yet. The folder on my system has it's own special prefix that can be whatever you want in your system as long as it's formatted like a folder path.
import boto3
import cred
import os
locale_file_Imagedirectory = r"C:\\Temp\\App Data\\iToven AI\\" # This is where all GUI files for iToven AI exist on PC
def downloadImageDirectoryS3(remoteDirectoryName, desired_parent_folder):
my_bucket = 'itovenbucket'
s3_resource = boto3.resource('s3', aws_access_key_id=cred.AWSAccessKeyId,
aws_secret_access_key=cred.AWSSecretKey)
bucket = s3_resource.Bucket(my_bucket)
for obj in bucket.objects.filter(Prefix=remoteDirectoryName):
pcVersionPrefix = remoteDirectoryName.replace("/", r"\\")
isolatedFileName = obj.key.replace(remoteDirectoryName, "")
clientSideFileName = desired_parent_folder+pcVersionPrefix+isolatedFileName
print(clientSideFileName) # Client-Side System File Structure
if not os.path.exists(desired_parent_folder+pcVersionPrefix): # CREATE DIRECTORIES FOR EACH FOLDER RECURSIVELY
os.makedirs(desired_parent_folder+pcVersionPrefix)
if obj.key not in desired_parent_folder+pcVersionPrefix:
bucket.download_file(obj.key, clientSideFileName) # save to new path
downloadImageDirectoryS3(r"iTovenGUIImagesPC/gui_media/", locale_file_Imagedirectory)

Categories

Resources