Read multiple CSV files from s3 using boto3 - python

The code below reads a CSV file from AWS s3 using Pycham on my local machine.
# Read CSV from s3
import os
import boto3
import pandas as pd
import sys
if sys.version_info[0] < 3:
from StringIO import StringIO # Python 2.x
else:
from io import StringIO
aws_id = 'XXXXXXXXXXXXXXX'
aws_secret = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
client = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
bucket_name = 'bucket-name'
object_key = 'folder-name/test.csv'
csv_obj = client.get_object(Bucket=bucket_name, Key=object_key)
body = csv_obj['Body']
csv_string = body.read().decode('utf-8')
df = pd.read_csv(StringIO(csv_string))
x = df.head()
print(x)
I would like to be able to read multiple CSV files in the same way. Pretty much anything that is in the folder.
My files are in the following directory:
bucket-name/folder-name/year=2018/month=01/file_032342.csv
bucket-name/folder-name/year=2018/month=02/file_434423.csv
bucket-name/folder-name/year=2018/month=03/file_343254.csv
bucket-name/folder-name/year=2018/month=04/file_544353.csv

Related

Read xlsx from azure blob storage to pandas dataframe without creating temporary file

I am trying to read a xlsx file from an Azure blob storage to a pandas dataframe without creating a temporary local file. I have seen many similar questions,
e.g. Issues Reading Azure Blob CSV Into Python Pandas DF, but haven't managed to get the proposed solutions to work.
Below code snippet results in a UnicodeDecodeError: 'utf-8' codec can't decode byte 0x87 in position 14: invalid start byte exception.
from io import StringIO
import pandas as pd
from azure.storage.blob import BlobClient, BlobServiceClient
blob_client = BlobClient.from_blob_url(blob_url = url + container + "/" + blobname, credential = token)
blob = blob_client.download_blob().content_as_text()
df = pd.read_excel(StringIO(blob))
Using a temporary file, I do manage to make it work with the following code snippet:
blob_service_client = BlobServiceClient(account_url = url, credential = token)
blob_client = blob_service_client.get_blob_client(container=container, blob=blobname)
with open(tmpfile, "wb") as my_blob:
download_stream = blob_client.download_blob()
my_blob.write(download_stream.readall())
data = pd.read_excel(tmpfile)
Similar to what you have already done, we could use download_blob() to get the StorageStreamDownloader object into memory, then context_as_text() to decode the contents to a string.
Then we can read the the contents from the CSV StringIO buffer into a pandas Dataframe with pandas.read_csv().
from io import StringIO
import pandas as pd
from azure.storage.blob import BlobClient, BlobServiceClient
import os
connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
blob_client = blob_service_client.get_blob_client(container="blobs", blob="test.csv")
blob = blob_client.download_blob().content_as_text()
df = pd.read_csv(StringIO(blob))
Update
If we are working with XLSX files, use content_as_bytes() to return bytes instead of a string, and convert to a pandas dataframe with pandas.read_excel():
from io import StringIO
import pandas as pd
from azure.storage.blob import BlobClient, BlobServiceClient
import os
connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
blob_client = blob_service_client.get_blob_client(container="blobs", blob="test.xlsx")
blob = blob_client.download_blob().content_as_bytes()
df = pd.read_excel(blob)
Since content_as_text() uses UTF-8 encoding by default, this is probably causing the UnicodeDecodeError when decoding bytes.
We could still use this with pandas.read_excel() if we set the encoding to None:
blob = blob_client.download_blob().content_as_text(encoding=None)
df = pd.read_excel(blob)

Python: Read CSV file without Pandas from S3 bucket [duplicate]

I have uploaded an excel file to AWS S3 bucket and now I want to read it in python. Any help would be appreciated. Here is what I have achieved so far,
import boto3
import os
aws_id = 'aws_id'
aws_secret = 'aws_secret_key'
client = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
bucket_name = 'my_bucket'
object_key = 'my_excel_file.xlsm'
object_file = client.get_object(Bucket=bucket_name, Key=object_key)
body = object_file['Body']
data = body.read()
What do I need to do next in order to read this data and work on it?
Spent quite some time on it and here's how I got it working,
import boto3
import io
import pandas as pd
import json
aws_id = ''
aws_secret = ''
bucket_name = ''
object_key = ''
s3 = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
obj = s3.get_object(Bucket=bucket_name, Key=object_key)
data = obj['Body'].read()
df = pd.read_excel(io.BytesIO(data), encoding='utf-8')
You can directly read xls file from S3 without having to download or save it locally. xlrd module has a provision to provide raw data to create workbook object.
Following is the code snippet.
from boto3 import Session
from xlrd.book import open_workbook_xls
aws_id = ''
aws_secret = ''
bucket_name = ''
object_key = ''
s3_session = Session(aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
bucket_object = s3_session.resource('s3').Bucket(bucket_name).Object(object_key)
content = bucket_object.get()['Body'].read()
workbook = open_workbook_xls(file_contents=content)
You can directly read excel files using awswrangler.s3.read_excel. Note that you can pass any pandas.read_excel() arguments (sheet name, etc) to this.
import awswrangler as wr
df = wr.s3.read_excel(path=s3_uri)
Python doesn't support excel files natively. You could use the pandas library pandas library read_excel functionality

upload a dataframe as a zipped csv directly to s3 without saving it on the local machine

How can I upload a data frame as a zipped csv into S3 bucket without saving it on my local machine first?
I have the connection to that bucket already running using:
self.s3_output = S3(bucket_name='test-bucket', bucket_subfolder='')
We can make a file-like object with BytesIO and zipfile from the standard library.
# 3.7
from io import BytesIO
import zipfile
# .to_csv returns a string when called with no args
s = df.to_csv()
with zipfile.ZipFile(BytesIO(), mode="w",) as z:
z.writestr("df.csv", s)
# upload file here
You'll want to refer to upload_fileobj in order to customize how the upload behaves.
yourclass.s3_output.upload_fileobj(z, ...)
This works equally well for zip and gz:
import boto3
import gzip
import pandas as pd
from io import BytesIO, TextIOWrapper
s3_client = boto3.client(
service_name = "s3",
endpoint_url = your_endpoint_url,
aws_access_key_id = your_access_key,
aws_secret_access_key = your_secret_key
# Your file name inside zip
your_filename = "test.csv"
s3_path = f"path/to/your/s3/compressed/file/test.zip"
bucket = "your_bucket"
df = your_df
gz_buffer = BytesIO()
with gzip.GzipFile(
filename = your_filename,
mode = 'w',
fileobj = gz_buffer ) as gz_file:
df.to_csv(TextIOWrapper(gz_file, 'utf8'), index=False)
s3.put_object(
Bucket=bucket, Key=s3_path, Body=gz_buffer.getvalue()
)

How to read a list of parquet files from S3 as a pandas dataframe using pyarrow?

I have a hacky way of achieving this using boto3 (1.4.4), pyarrow (0.4.1) and pandas (0.20.3).
First, I can read a single parquet file locally like this:
import pyarrow.parquet as pq
path = 'parquet/part-r-00000-1e638be4-e31f-498a-a359-47d017a0059c.gz.parquet'
table = pq.read_table(path)
df = table.to_pandas()
I can also read a directory of parquet files locally like this:
import pyarrow.parquet as pq
dataset = pq.ParquetDataset('parquet/')
table = dataset.read()
df = table.to_pandas()
Both work like a charm. Now I want to achieve the same remotely with files stored in a S3 bucket. I was hoping that something like this would work:
dataset = pq.ParquetDataset('s3n://dsn/to/my/bucket')
But it does not:
OSError: Passed non-file path: s3n://dsn/to/my/bucket
After reading pyarrow's documentation thoroughly, this does not seem possible at the moment. So I came out with the following solution:
Reading a single file from S3 and getting a pandas dataframe:
import io
import boto3
import pyarrow.parquet as pq
buffer = io.BytesIO()
s3 = boto3.resource('s3')
s3_object = s3.Object('bucket-name', 'key/to/parquet/file.gz.parquet')
s3_object.download_fileobj(buffer)
table = pq.read_table(buffer)
df = table.to_pandas()
And here my hacky, not-so-optimized, solution to create a pandas dataframe from a S3 folder path:
import io
import boto3
import pandas as pd
import pyarrow.parquet as pq
bucket_name = 'bucket-name'
def download_s3_parquet_file(s3, bucket, key):
buffer = io.BytesIO()
s3.Object(bucket, key).download_fileobj(buffer)
return buffer
client = boto3.client('s3')
s3 = boto3.resource('s3')
objects_dict = client.list_objects_v2(Bucket=bucket_name, Prefix='my/folder/prefix')
s3_keys = [item['Key'] for item in objects_dict['Contents'] if item['Key'].endswith('.parquet')]
buffers = [download_s3_parquet_file(s3, bucket_name, key) for key in s3_keys]
dfs = [pq.read_table(buffer).to_pandas() for buffer in buffers]
df = pd.concat(dfs, ignore_index=True)
Is there a better way to achieve this? Maybe some kind of connector for pandas using pyarrow? I would like to avoid using pyspark, but if there is no other solution, then I would take it.
You should use the s3fs module as proposed by yjk21. However as result of calling ParquetDataset you'll get a pyarrow.parquet.ParquetDataset object. To get the Pandas DataFrame you'll rather want to apply .read_pandas().to_pandas() to it:
import pyarrow.parquet as pq
import s3fs
s3 = s3fs.S3FileSystem()
pandas_dataframe = pq.ParquetDataset('s3://your-bucket/', filesystem=s3).read_pandas().to_pandas()
Thanks! Your question actually tell me a lot. This is how I do it now with pandas (0.21.1), which will call pyarrow, and boto3 (1.3.1).
import boto3
import io
import pandas as pd
# Read single parquet file from S3
def pd_read_s3_parquet(key, bucket, s3_client=None, **args):
if s3_client is None:
s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket, Key=key)
return pd.read_parquet(io.BytesIO(obj['Body'].read()), **args)
# Read multiple parquets from a folder on S3 generated by spark
def pd_read_s3_multiple_parquets(filepath, bucket, s3=None,
s3_client=None, verbose=False, **args):
if not filepath.endswith('/'):
filepath = filepath + '/' # Add '/' to the end
if s3_client is None:
s3_client = boto3.client('s3')
if s3 is None:
s3 = boto3.resource('s3')
s3_keys = [item.key for item in s3.Bucket(bucket).objects.filter(Prefix=filepath)
if item.key.endswith('.parquet')]
if not s3_keys:
print('No parquet found in', bucket, filepath)
elif verbose:
print('Load parquets:')
for p in s3_keys:
print(p)
dfs = [pd_read_s3_parquet(key, bucket=bucket, s3_client=s3_client, **args)
for key in s3_keys]
return pd.concat(dfs, ignore_index=True)
Then you can read multiple parquets under a folder from S3 by
df = pd_read_s3_multiple_parquets('path/to/folder', 'my_bucket')
(One can simplify this code a lot I guess.)
It can be done using boto3 as well without the use of pyarrow
import boto3
import io
import pandas as pd
# Read the parquet file
buffer = io.BytesIO()
s3 = boto3.resource('s3')
object = s3.Object('bucket_name','key')
object.download_fileobj(buffer)
df = pd.read_parquet(buffer)
print(df.head())
Probably the easiest way to read parquet data on the cloud into dataframes is to use dask.dataframe in this way:
import dask.dataframe as dd
df = dd.read_parquet('s3://bucket/path/to/data-*.parq')
dask.dataframe can read from Google Cloud Storage, Amazon S3, Hadoop file system and more!
Provided you have the right package setup
$ pip install pandas==1.1.0 pyarrow==1.0.0 s3fs==0.4.2
and your AWS shared config and credentials files configured appropriately
you can use pandas right away:
import pandas as pd
df = pd.read_parquet("s3://bucket/key.parquet")
In case of having multiple AWS profiles you may also need to set
$ export AWS_DEFAULT_PROFILE=profile_under_which_the_bucket_is_accessible
so you can access your bucket.
If you are open to also use AWS Data Wrangler.
import awswrangler as wr
df = wr.s3.read_parquet(path="s3://...")
You can use s3fs from dask which implements a filesystem interface for s3. Then you can use the filesystem argument of ParquetDataset like so:
import s3fs
s3 = s3fs.S3FileSystem()
dataset = pq.ParquetDataset('s3n://dsn/to/my/bucket', filesystem=s3)
Using pre-signed URLs
s3 =s3fs.S3FileSystem(key='your_key',secret='your_secret',client_kwargs={"endpoint_url":'your_end_point'})
df = dd.read_parquet(s3.url('your_bucket' + 'your_filepath',expires=3600,client_method='get_object'))
I have tried the #oya163 solution and it works but after little bit change
import boto3
import io
import pandas as pd
# Read the parquet file
buffer = io.BytesIO()
s3 = boto3.resource('s3',aws_access_key_id='123',aws_secret_access_key= '456')
object = s3.Object('bucket_name','myoutput.parquet')
object.download_fileobj(buffer)
df = pd.read_parquet(buffer)
print(df.head())

Save Dataframe to csv directly to s3 Python

I have a pandas DataFrame that I want to upload to a new CSV file. The problem is that I don't want to save the file locally before transferring it to s3. Is there any method like to_csv for writing the dataframe to s3 directly? I am using boto3.
Here is what I have so far:
import boto3
s3 = boto3.client('s3', aws_access_key_id='key', aws_secret_access_key='secret_key')
read_file = s3.get_object(Bucket, Key)
df = pd.read_csv(read_file['Body'])
# Make alterations to DataFrame
# Then export DataFrame to CSV through direct transfer to s3
You can use:
from io import StringIO # python3; python2: BytesIO
import boto3
bucket = 'my_bucket_name' # already created on S3
csv_buffer = StringIO()
df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'df.csv').put(Body=csv_buffer.getvalue())
You can directly use the S3 path. I am using Pandas 0.24.1
In [1]: import pandas as pd
In [2]: df = pd.DataFrame( [ [1, 1, 1], [2, 2, 2] ], columns=['a', 'b', 'c'])
In [3]: df
Out[3]:
a b c
0 1 1 1
1 2 2 2
In [4]: df.to_csv('s3://experimental/playground/temp_csv/dummy.csv', index=False)
In [5]: pd.__version__
Out[5]: '0.24.1'
In [6]: new_df = pd.read_csv('s3://experimental/playground/temp_csv/dummy.csv')
In [7]: new_df
Out[7]:
a b c
0 1 1 1
1 2 2 2
Release Note:
S3 File Handling
pandas now uses s3fs for handling S3 connections. This shouldn’t break any code. However, since s3fs is not a required dependency, you will need to install it separately, like boto in prior versions of pandas. GH11915.
I like s3fs which lets you use s3 (almost) like a local filesystem.
You can do this:
import s3fs
bytes_to_write = df.to_csv(None).encode()
fs = s3fs.S3FileSystem(key=key, secret=secret)
with fs.open('s3://bucket/path/to/file.csv', 'wb') as f:
f.write(bytes_to_write)
s3fs supports only rb and wb modes of opening the file, that's why I did this bytes_to_write stuff.
This is a more up to date answer:
import s3fs
s3 = s3fs.S3FileSystem(anon=False)
# Use 'w' for py3, 'wb' for py2
with s3.open('<bucket-name>/<filename>.csv','w') as f:
df.to_csv(f)
The problem with StringIO is that it will eat away at your memory. With this method, you are streaming the file to s3, rather than converting it to string, then writing it into s3. Holding the pandas dataframe and its string copy in memory seems very inefficient.
If you are working in an ec2 instant, you can give it an IAM role to enable writing it to s3, thus you dont need to pass in credentials directly. However, you can also connect to a bucket by passing credentials to the S3FileSystem() function. See documention:https://s3fs.readthedocs.io/en/latest/
If you pass None as the first argument to to_csv() the data will be returned as a string. From there it's an easy step to upload that to S3 in one go.
It should also be possible to pass a StringIO object to to_csv(), but using a string will be easier.
You can also use the AWS Data Wrangler:
import awswrangler as wr
wr.s3.to_csv(
df=df,
path="s3://...",
)
Note that it will handle multipart upload for you to make the upload faster.
I found this can be done using client also and not just resource.
from io import StringIO
import boto3
s3 = boto3.client("s3",\
region_name=region_name,\
aws_access_key_id=aws_access_key_id,\
aws_secret_access_key=aws_secret_access_key)
csv_buf = StringIO()
df.to_csv(csv_buf, header=True, index=False)
csv_buf.seek(0)
s3.put_object(Bucket=bucket, Body=csv_buf.getvalue(), Key='path/test.csv')
I use AWS Data Wrangler. For example:
import awswrangler as wr
import pandas as pd
# read a local dataframe
df = pd.read_parquet('my_local_file.gz')
# upload to S3 bucket
wr.s3.to_parquet(df=df, path='s3://mys3bucket/file_name.gz')
The same applies to csv files. Instead of read_parquet and to_parquet, use read_csv and to_csv with the proper file extension.
since you are using boto3.client(), try:
import boto3
from io import StringIO #python3
s3 = boto3.client('s3', aws_access_key_id='key', aws_secret_access_key='secret_key')
def copy_to_s3(client, df, bucket, filepath):
csv_buf = StringIO()
df.to_csv(csv_buf, header=True, index=False)
csv_buf.seek(0)
client.put_object(Bucket=bucket, Body=csv_buf.getvalue(), Key=filepath)
print(f'Copy {df.shape[0]} rows to S3 Bucket {bucket} at {filepath}, Done!')
copy_to_s3(client=s3, df=df_to_upload, bucket='abc', filepath='def/test.csv')
You can use
pandas
boto3
s3fs (version ≤0.4)
I use to_csv with s3:// in path and storage_options
key = "folder/file.csv"
df.to_csv(
f"s3://{YOUR_S3_BUCKET}/{key}",
index=False,
storage_options={
"key": AWS_ACCESS_KEY_ID,
"secret": AWS_SECRET_ACCESS_KEY,
"token": AWS_SESSION_TOKEN,
},
To handle large files efficiently you can also use an open-source S3-compatible MinIO, with its minio python client package, like in this function of mine:
import minio
import os
import pandas as pd
minio_client = minio.Minio(..)
def write_df_to_minio(df,
minio_client,
bucket_name,
file_name="new-file.csv",
local_temp_folder="/tmp/",
content_type="application/csv",
sep=",",
save_row_index=False):
df.to_csv(os.path.join(local_temp_folder, file_name), sep=sep, index=save_row_index)
minio_results = minio_client.fput_object(bucket_name=bucket_name,
object_name=file_name,
file_path=os.path.join(local_temp_folder, file_name),
content_type=content_type)
assert minio_results.object_name == file_name
Another option is to do this with cloudpathlib, which supports S3 and also Google Cloud Storage and Azure Blob Storage. See example below.
import pandas as pd
from cloudpathlib import CloudPath
# read data from S3
df = pd.read_csv(CloudPath("s3://covid19-lake/rearc-covid-19-testing-data/csv/states_daily/states_daily.csv"))
# look at some of the data
df.head(1).T.iloc[:10]
#> 0
#> date 20210307
#> state AK
#> positive 56886.0
#> probableCases NaN
#> negative NaN
#> pending NaN
#> totalTestResultsSource totalTestsViral
#> totalTestResults 1731628.0
#> hospitalizedCurrently 33.0
#> hospitalizedCumulative 1293.0
# writing to S3
with CloudPath("s3://bucket-you-can-write-to/data.csv").open("w") as f:
df.to_csv(f)
CloudPath("s3://bucket-you-can-write-to/data.csv").exists()
#> True
Note, that you can't call df.to_csv(CloudPath("s3://drivendata-public-assets/test-asdf2.csv")) directly because of the way pandas handles paths/handles passed to it. Instead you need to open the file for writing and pass that handle directly to to_csv.
This comes with a few added benefits in terms of setting particular options or different authentication mechanisms or keeping a persistent cache so you don't always need to redownload from S3.
from io import StringIO
import boto3
#Creating Session With Boto3.
session = boto3.Session(
aws_access_key_id='<your_access_key_id>',
aws_secret_access_key='<your_secret_access_key>'
)
#Creating S3 Resource From the Session.
s3_res = session.resource('s3')
csv_buffer = StringIO()
df.to_csv(csv_buffer)
bucket_name = 'stackvidhya'
s3_object_name = 'df.csv'
s3_res.Object(bucket_name, s3_object_name).put(Body=csv_buffer.getvalue())
print("Dataframe is saved as CSV in S3 bucket.")
For those who might have problems with S3FS or fsspec using Lambda:
You have to create a layer for each libary and insert them in your Lambda.
You can find how to crate a layer here.
I read a csv with two columns from bucket s3, and the content of the file csv i put in pandas dataframe.
Example:
config.json
{
"credential": {
"access_key":"xxxxxx",
"secret_key":"xxxxxx"
}
,
"s3":{
"bucket":"mybucket",
"key":"csv/user.csv"
}
}
cls_config.json
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import json
class cls_config(object):
def __init__(self,filename):
self.filename = filename
def getConfig(self):
fileName = os.path.join(os.path.dirname(__file__), self.filename)
with open(fileName) as f:
config = json.load(f)
return config
cls_pandas.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import io
class cls_pandas(object):
def __init__(self):
pass
def read(self,stream):
df = pd.read_csv(io.StringIO(stream), sep = ",")
return df
cls_s3.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import boto3
import json
class cls_s3(object):
def __init__(self,access_key,secret_key):
self.s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)
def getObject(self,bucket,key):
read_file = self.s3.get_object(Bucket=bucket, Key=key)
body = read_file['Body'].read().decode('utf-8')
return body
test.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from cls_config import *
from cls_s3 import *
from cls_pandas import *
class test(object):
def __init__(self):
self.conf = cls_config('config.json')
def process(self):
conf = self.conf.getConfig()
bucket = conf['s3']['bucket']
key = conf['s3']['key']
access_key = conf['credential']['access_key']
secret_key = conf['credential']['secret_key']
s3 = cls_s3(access_key,secret_key)
ob = s3.getObject(bucket,key)
pa = cls_pandas()
df = pa.read(ob)
print df
if __name__ == '__main__':
test = test()
test.process()

Categories

Resources