I'm trying to write a pandas DataFrame as a pickle file to an S3 bucket in AWS. I know that I can write the DataFrame new_df as a CSV to an S3 bucket as follows:
from io import StringIO
import boto3

bucket = 'mybucket'
key = 'path'
csv_buffer = StringIO()
s3_resource = boto3.resource('s3')
new_df.to_csv(csv_buffer, index=False)
s3_resource.Object(bucket, key).put(Body=csv_buffer.getvalue())
I've tried using the same code as above with to_pickle() but with no success.
Further to your answer, you don't need to convert to CSV. The pickle.dumps method returns a bytes object; see here: https://docs.python.org/3/library/pickle.html
import boto3
import pickle
bucket='your_bucket_name'
key='your_pickle_filename.pkl'
pickle_byte_obj = pickle.dumps([var1, var2, ..., varn])
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket,key).put(Body=pickle_byte_obj)
I've found the solution: you need to use BytesIO as the buffer for pickle files instead of StringIO (which is for CSV files).
import io
import boto3
pickle_buffer = io.BytesIO()
s3_resource = boto3.resource('s3')
new_df.to_pickle(pickle_buffer)
s3_resource.Object(bucket, key).put(Body=pickle_buffer.getvalue())
This worked for me with pandas 0.23.4 and boto3 1.7.80:
bucket = 'your_bucket_name'
key = 'your_pickle_filename.pkl'
s3_resource = boto3.resource('s3')
new_df.to_pickle(key)  # writes the pickle to a local file first
s3_resource.Object(bucket, key).put(Body=open(key, 'rb'))
This solution (using s3fs) worked perfectly and elegantly for my team:
import s3fs
from pickle import dump

fs = s3fs.S3FileSystem(anon=False)
bucket = 'bucket1'
key = 'your_pickle_filename.pkl'
with fs.open(f's3://{bucket}/{key}', 'wb') as f:
    dump(data, f)
This adds some clarification to a previous answer:
import pandas as pd
import boto3
# make df
df = pd.DataFrame({'col1': [1, 2, 3]})
# bucket name
str_bucket = 'bucket_name'
# filename
str_key_file = 'df.pkl'
# bucket path
str_key_bucket = f'dir_1/dir2/{str_key_file}'
# write df to local pkl file
df.to_pickle(str_key_file)
# put object into s3
boto3.resource('s3').Object(str_bucket, str_key_bucket).put(Body=open(str_key_file, 'rb'))
From the just-released book 'Time Series Analysis with Python' by Tarek Atwan, I learned this method:
import pandas as pd

df = pd.DataFrame(...)
df.to_pickle(
    's3://mybucket/pklfile.bz2',
    storage_options={
        'key': AWS_ACCESS_KEY,
        'secret': AWS_SECRET_KEY,
    },
)
which I believe is more pythonic.
I've found the best solution is to just upgrade pandas and also install s3fs:
pip install s3fs==2022.8.2
pip install pandas==1.1.5
bucket, key = 'mybucket', 'path'
df.to_pickle(f"s3://{bucket}/{key}.pkl.gz", compression='gzip')
My question is inspired by a previous SO question about this topic: uploading and saving DataFrames as CSV files in Amazon Web Services (AWS) S3. Using Python 3, I would like to use s3.upload_fileobj (multipart uploads) to make the data transfer to S3 faster. When I run the code in the accepted answer, I receive the error message: "TypeError: a bytes-like object is required, not 'str'".
The answer has recently been upvoted several times, so I think there must be a way to run this code without error in Python 3.
Please find the code below. For simplicity, let's use a simple DataFrame; in reality this DataFrame is much bigger (around 500 MB).
import pandas as pd
import io
df = pd.DataFrame({'A':[1,2,3], 'B':[6,7,8]})
The code is the following; for convenience I turned it into a function:
import boto3

def upload_file(dataframe, bucket, key):
    """dataframe=DataFrame, bucket=bucket name in AWS S3, key=key name in AWS S3"""
    s3 = boto3.client('s3')
    csv_buffer = io.BytesIO()
    dataframe.to_csv(csv_buffer, compression='gzip')
    s3.upload_fileobj(csv_buffer, bucket, key)

upload_file(df, 'your-bucket', 'your-key')
Thank you very much for your advice!
Going off this reference, it seems you'll need to wrap a gzip.GzipFile object around your BytesIO which will then perform the compression for you.
import io
import gzip
buffer = io.BytesIO()
with gzip.GzipFile(fileobj=buffer, mode="wb") as f:
    f.write(df.to_csv().encode())

buffer.seek(0)
s3.upload_fileobj(buffer, bucket, key)
Minimal Verifiable Example
import io
import gzip
import zlib
import pandas as pd

# Encode
df = pd.DataFrame({'A': [1, 2, 3], 'B': [6, 7, 8]})
buffer = io.BytesIO()
with gzip.GzipFile(fileobj=buffer, mode="wb") as f:
    f.write(df.to_csv().encode())
buffer.getvalue()
# b'\x1f\x8b\x08\x00\xf0\x0b\x11]\x02\xff\xd3q\xd4q\xe22\xd01\xd41\xe32\xd41\xd21\xe72\xd21\xd6\xb1\xe0\x02\x00Td\xc2\xf5\x17\x00\x00\x00'
# Decode
print(zlib.decompress(buffer.getvalue(), 16 + zlib.MAX_WBITS).decode())
# ,A,B
# 0,1,6
# 1,2,7
# 2,3,8
The only thing you need is a TextIOWrapper, since to_csv expects a string buffer while upload_fileobj expects bytes:
def upload_file(dataframe, bucket, key):
    """dataframe=DataFrame, bucket=bucket name in AWS S3, key=key name in AWS S3"""
    s3 = boto3.client('s3')
    csv_buffer = io.BytesIO()
    w = io.TextIOWrapper(csv_buffer)
    dataframe.to_csv(w, compression='gzip')
    w.seek(0)
    s3.upload_fileobj(csv_buffer, bucket, key)
And the code uploads fine
$ cat test.csv
,A,B
0,1,6
1,2,7
2,3,8
You could try something like this.
import pandas as pd
import io
df = pd.DataFrame({'A':[1,2,3], 'B':[6,7,8]})
def upload_file(dataframe, bucket, key):
    """dataframe=DataFrame, bucket=bucket name in AWS S3, key=key name in AWS S3"""
    s3 = boto3.client('s3')
    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, compression='gzip')
    csv_buffer.seek(0)
    s3.upload_fileobj(csv_buffer, bucket, key)
upload_file(df, your-bucket, your-key)
I have a pandas DataFrame which I'm trying to save as a parquet file to S3:
dftest = pd.DataFrame({'field': [1,2,3]})
dftest.to_parquet("s3://bucket_name/test.parquet", engine='pyarrow',
compression='gzip')
I'm getting : "FileNotFoundError: bucket_name/test.parquet"
Although I still couldn't make the pandas.DataFrame.to_parquet approach work with S3, I did find a different solution which seems to work:
import s3fs
from fastparquet import write
s3 = s3fs.S3FileSystem()
myopen = s3.open
write('s3://bucketname/test.parquet', dftest, compression='GZIP', open_with=myopen)
I have several CSV files (50 GB) in an S3 bucket in Amazon Cloud. I am trying to read these files in a Jupyter Notebook (with Python3 Kernel) using the following code:
import boto3
from boto3 import session
import pandas as pd
session = boto3.session.Session(region_name='XXXX')
s3client = session.client('s3', config = boto3.session.Config(signature_version='XXXX'))
response = s3client.get_object(Bucket='myBucket', Key='myKey')
names = ['id','origin','name']
dataset = pd.read_csv(response['Body'], names=names)
dataset.head()
But I face the following error when I run the code:
ValueError: Invalid file path or buffer object type: <class 'botocore.response.StreamingBody'>
I came across this bug report about pandas and boto3 object not being compatible yet.
My question is: how else can I import these CSV files from my S3 bucket into my Jupyter notebook, which runs in the cloud?
You can also use s3fs which allows pandas to read directly from S3:
import s3fs
# csv file
df = pd.read_csv('s3://{bucket_name}/{path_to_file}')
# parquet file
df = pd.read_parquet('s3://{bucket_name}/{path_to_file}')
And then if you have multiple files in a bucket, you can iterate through them like so:
import boto3
s3_resource = boto3.resource('s3')
bucket = s3_resource.Bucket(name='{bucket_name}')
for file in bucket.objects.all():
    # do what you want with the files
    # for example:
    if 'filter' in file.key:
        print(file.key)
        new_df = pd.read_csv('s3://{bucket_name}/{}'.format(file.key))
I am posting this fix to my problem in case somebody needs it. I replaced the read_csv line with the following and the problem was solved:
import io
dataset = pd.read_csv(io.BytesIO(response['Body'].read()), encoding='utf8')
I have a hacky way of achieving this using boto3 (1.4.4), pyarrow (0.4.1) and pandas (0.20.3).
First, I can read a single parquet file locally like this:
import pyarrow.parquet as pq
path = 'parquet/part-r-00000-1e638be4-e31f-498a-a359-47d017a0059c.gz.parquet'
table = pq.read_table(path)
df = table.to_pandas()
I can also read a directory of parquet files locally like this:
import pyarrow.parquet as pq
dataset = pq.ParquetDataset('parquet/')
table = dataset.read()
df = table.to_pandas()
Both work like a charm. Now I want to achieve the same remotely with files stored in a S3 bucket. I was hoping that something like this would work:
dataset = pq.ParquetDataset('s3n://dsn/to/my/bucket')
But it does not:
OSError: Passed non-file path: s3n://dsn/to/my/bucket
After reading pyarrow's documentation thoroughly, this does not seem possible at the moment. So I came up with the following solution:
Reading a single file from S3 and getting a pandas dataframe:
import io
import boto3
import pyarrow.parquet as pq
buffer = io.BytesIO()
s3 = boto3.resource('s3')
s3_object = s3.Object('bucket-name', 'key/to/parquet/file.gz.parquet')
s3_object.download_fileobj(buffer)
table = pq.read_table(buffer)
df = table.to_pandas()
And here is my hacky, not-so-optimized solution to create a pandas DataFrame from an S3 folder path:
import io
import boto3
import pandas as pd
import pyarrow.parquet as pq
bucket_name = 'bucket-name'
def download_s3_parquet_file(s3, bucket, key):
    buffer = io.BytesIO()
    s3.Object(bucket, key).download_fileobj(buffer)
    return buffer
client = boto3.client('s3')
s3 = boto3.resource('s3')
objects_dict = client.list_objects_v2(Bucket=bucket_name, Prefix='my/folder/prefix')
s3_keys = [item['Key'] for item in objects_dict['Contents'] if item['Key'].endswith('.parquet')]
buffers = [download_s3_parquet_file(s3, bucket_name, key) for key in s3_keys]
dfs = [pq.read_table(buffer).to_pandas() for buffer in buffers]
df = pd.concat(dfs, ignore_index=True)
Is there a better way to achieve this? Maybe some kind of connector for pandas using pyarrow? I would like to avoid using pyspark, but if there is no other solution, then I would take it.
You should use the s3fs module as proposed by yjk21. However, as a result of calling ParquetDataset you'll get a pyarrow.parquet.ParquetDataset object. To get a pandas DataFrame you'll want to apply .read_pandas().to_pandas() to it:
import pyarrow.parquet as pq
import s3fs
s3 = s3fs.S3FileSystem()
pandas_dataframe = pq.ParquetDataset('s3://your-bucket/', filesystem=s3).read_pandas().to_pandas()
Thanks! Your question actually tells me a lot. This is how I do it now with pandas (0.21.1), which will call pyarrow, and boto3 (1.3.1).
import boto3
import io
import pandas as pd
# Read single parquet file from S3
def pd_read_s3_parquet(key, bucket, s3_client=None, **args):
    if s3_client is None:
        s3_client = boto3.client('s3')
    obj = s3_client.get_object(Bucket=bucket, Key=key)
    return pd.read_parquet(io.BytesIO(obj['Body'].read()), **args)
# Read multiple parquets from a folder on S3 generated by spark
def pd_read_s3_multiple_parquets(filepath, bucket, s3=None,
                                 s3_client=None, verbose=False, **args):
    if not filepath.endswith('/'):
        filepath = filepath + '/'  # Add '/' to the end
    if s3_client is None:
        s3_client = boto3.client('s3')
    if s3 is None:
        s3 = boto3.resource('s3')
    s3_keys = [item.key for item in s3.Bucket(bucket).objects.filter(Prefix=filepath)
               if item.key.endswith('.parquet')]
    if not s3_keys:
        print('No parquet found in', bucket, filepath)
    elif verbose:
        print('Load parquets:')
        for p in s3_keys:
            print(p)
    dfs = [pd_read_s3_parquet(key, bucket=bucket, s3_client=s3_client, **args)
           for key in s3_keys]
    return pd.concat(dfs, ignore_index=True)
Then you can read multiple parquets under a folder from S3 by
df = pd_read_s3_multiple_parquets('path/to/folder', 'my_bucket')
(One can simplify this code a lot I guess.)
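For instance, here is a hedged sketch of a shorter variant, assuming a newer pandas with s3fs installed so that read_parquet accepts s3:// URLs directly (bucket and prefix names are placeholders):

import boto3
import pandas as pd

bucket, prefix = 'my_bucket', 'path/to/folder/'
s3 = boto3.resource('s3')
# list the parquet keys under the prefix, then let pandas/s3fs fetch each one
keys = [obj.key for obj in s3.Bucket(bucket).objects.filter(Prefix=prefix)
        if obj.key.endswith('.parquet')]
df = pd.concat((pd.read_parquet(f's3://{bucket}/{k}') for k in keys), ignore_index=True)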
It can also be done using boto3, without the use of pyarrow:
import boto3
import io
import pandas as pd
# Read the parquet file
buffer = io.BytesIO()
s3 = boto3.resource('s3')
object = s3.Object('bucket_name','key')
object.download_fileobj(buffer)
df = pd.read_parquet(buffer)
print(df.head())
Probably the easiest way to read parquet data on the cloud into dataframes is to use dask.dataframe in this way:
import dask.dataframe as dd
df = dd.read_parquet('s3://bucket/path/to/data-*.parq')
dask.dataframe can read from Google Cloud Storage, Amazon S3, Hadoop file system and more!
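If credentials aren't picked up from your environment, here is a hedged sketch of passing them explicitly through storage_options (the key values are placeholders):

import dask.dataframe as dd

# storage_options is forwarded to s3fs under the hood
df = dd.read_parquet(
    's3://bucket/path/to/data-*.parq',
    storage_options={'key': 'YOUR_ACCESS_KEY', 'secret': 'YOUR_SECRET_KEY'},
)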
Provided you have the right package setup
$ pip install pandas==1.1.0 pyarrow==1.0.0 s3fs==0.4.2
and your AWS shared config and credentials files configured appropriately
you can use pandas right away:
import pandas as pd
df = pd.read_parquet("s3://bucket/key.parquet")
In case of having multiple AWS profiles you may also need to set
$ export AWS_DEFAULT_PROFILE=profile_under_which_the_bucket_is_accessible
so you can access your bucket.
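Alternatively, here is a hedged sketch of selecting a profile per call via storage_options, which pandas should forward to s3fs (the profile name is a placeholder):

import pandas as pd

df = pd.read_parquet(
    "s3://bucket/key.parquet",
    storage_options={"profile": "profile_under_which_the_bucket_is_accessible"},
)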
If you are open to also using AWS Data Wrangler:
import awswrangler as wr
df = wr.s3.read_parquet(path="s3://...")
You can use s3fs from dask which implements a filesystem interface for s3. Then you can use the filesystem argument of ParquetDataset like so:
import s3fs
s3 = s3fs.S3FileSystem()
dataset = pq.ParquetDataset('s3n://dsn/to/my/bucket', filesystem=s3)
Using pre-signed URLs
import s3fs
import dask.dataframe as dd
s3 = s3fs.S3FileSystem(key='your_key', secret='your_secret', client_kwargs={"endpoint_url": 'your_end_point'})
df = dd.read_parquet(s3.url('your_bucket' + '/your_filepath', expires=3600, client_method='get_object'))
I have tried @oya163's solution and it works, but after a little bit of change:
import boto3
import io
import pandas as pd
# Read the parquet file
buffer = io.BytesIO()
s3 = boto3.resource('s3',aws_access_key_id='123',aws_secret_access_key= '456')
object = s3.Object('bucket_name','myoutput.parquet')
object.download_fileobj(buffer)
df = pd.read_parquet(buffer)
print(df.head())
I have a pandas DataFrame that I want to upload to a new CSV file. The problem is that I don't want to save the file locally before transferring it to s3. Is there any method like to_csv for writing the dataframe to s3 directly? I am using boto3.
Here is what I have so far:
import boto3
s3 = boto3.client('s3', aws_access_key_id='key', aws_secret_access_key='secret_key')
read_file = s3.get_object(Bucket, Key)
df = pd.read_csv(read_file['Body'])
# Make alterations to DataFrame
# Then export DataFrame to CSV through direct transfer to s3
You can use:
from io import StringIO # python3; python2: BytesIO
import boto3
bucket = 'my_bucket_name' # already created on S3
csv_buffer = StringIO()
df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'df.csv').put(Body=csv_buffer.getvalue())
You can directly use the S3 path. I am using Pandas 0.24.1
In [1]: import pandas as pd
In [2]: df = pd.DataFrame( [ [1, 1, 1], [2, 2, 2] ], columns=['a', 'b', 'c'])
In [3]: df
Out[3]:
a b c
0 1 1 1
1 2 2 2
In [4]: df.to_csv('s3://experimental/playground/temp_csv/dummy.csv', index=False)
In [5]: pd.__version__
Out[5]: '0.24.1'
In [6]: new_df = pd.read_csv('s3://experimental/playground/temp_csv/dummy.csv')
In [7]: new_df
Out[7]:
a b c
0 1 1 1
1 2 2 2
Release Note:
S3 File Handling
pandas now uses s3fs for handling S3 connections. This shouldn’t break any code. However, since s3fs is not a required dependency, you will need to install it separately, like boto in prior versions of pandas. GH11915.
I like s3fs which lets you use s3 (almost) like a local filesystem.
You can do this:
import s3fs
bytes_to_write = df.to_csv(None).encode()
fs = s3fs.S3FileSystem(key=key, secret=secret)
with fs.open('s3://bucket/path/to/file.csv', 'wb') as f:
    f.write(bytes_to_write)
s3fs supports only the rb and wb modes for opening a file; that's why I did this bytes_to_write stuff.
This is a more up to date answer:
import s3fs
s3 = s3fs.S3FileSystem(anon=False)
# Use 'w' for py3, 'wb' for py2
with s3.open('<bucket-name>/<filename>.csv', 'w') as f:
    df.to_csv(f)
The problem with StringIO is that it will eat away at your memory. With this method, you are streaming the file to S3, rather than converting it to a string and then writing it into S3. Holding the pandas DataFrame and its string copy in memory seems very inefficient.
If you are working on an EC2 instance, you can give it an IAM role to enable writing to S3, so you don't need to pass in credentials directly. However, you can also connect to a bucket by passing credentials to the S3FileSystem() function. See the documentation: https://s3fs.readthedocs.io/en/latest/
If you pass None as the first argument to to_csv() the data will be returned as a string. From there it's an easy step to upload that to S3 in one go.
It should also be possible to pass a StringIO object to to_csv(), but using a string will be easier.
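Here is a hedged sketch of that approach, assuming a boto3 client and placeholder bucket/key names:

import boto3
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [6, 7, 8]})
csv_string = df.to_csv(None, index=False)  # to_csv returns a string when given no path or buffer

s3 = boto3.client('s3')
s3.put_object(Bucket='my-bucket', Key='path/df.csv', Body=csv_string.encode('utf-8'))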
You can also use the AWS Data Wrangler:
import awswrangler as wr
wr.s3.to_csv(
df=df,
path="s3://...",
)
Note that it will handle multipart upload for you to make the upload faster.
I found this can be done using the client as well, not just the resource.
from io import StringIO
import boto3
s3 = boto3.client("s3",\
region_name=region_name,\
aws_access_key_id=aws_access_key_id,\
aws_secret_access_key=aws_secret_access_key)
csv_buf = StringIO()
df.to_csv(csv_buf, header=True, index=False)
csv_buf.seek(0)
s3.put_object(Bucket=bucket, Body=csv_buf.getvalue(), Key='path/test.csv')
I use AWS Data Wrangler. For example:
import awswrangler as wr
import pandas as pd
# read a local dataframe
df = pd.read_parquet('my_local_file.gz')
# upload to S3 bucket
wr.s3.to_parquet(df=df, path='s3://mys3bucket/file_name.gz')
The same applies to csv files. Instead of read_parquet and to_parquet, use read_csv and to_csv with the proper file extension.
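For example, here is a hedged CSV sketch (the local file name and S3 path are placeholders):

import awswrangler as wr
import pandas as pd

df = pd.read_csv('my_local_file.csv')
wr.s3.to_csv(df=df, path='s3://mys3bucket/file_name.csv', index=False)
df_back = wr.s3.read_csv(path='s3://mys3bucket/file_name.csv')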
Since you are using boto3.client(), try:
import boto3
from io import StringIO #python3
s3 = boto3.client('s3', aws_access_key_id='key', aws_secret_access_key='secret_key')
def copy_to_s3(client, df, bucket, filepath):
    csv_buf = StringIO()
    df.to_csv(csv_buf, header=True, index=False)
    csv_buf.seek(0)
    client.put_object(Bucket=bucket, Body=csv_buf.getvalue(), Key=filepath)
    print(f'Copy {df.shape[0]} rows to S3 Bucket {bucket} at {filepath}, Done!')

copy_to_s3(client=s3, df=df_to_upload, bucket='abc', filepath='def/test.csv')
You can use pandas, boto3, and s3fs (version ≤ 0.4). I use to_csv with s3:// in the path and storage_options:
key = "folder/file.csv"
df.to_csv(
    f"s3://{YOUR_S3_BUCKET}/{key}",
    index=False,
    storage_options={
        "key": AWS_ACCESS_KEY_ID,
        "secret": AWS_SECRET_ACCESS_KEY,
        "token": AWS_SESSION_TOKEN,
    },
)
To handle large files efficiently, you can also use the open-source, S3-compatible MinIO with its minio Python client package, as in this function of mine:
import minio
import os
import pandas as pd
minio_client = minio.Minio(..)

def write_df_to_minio(df,
                      minio_client,
                      bucket_name,
                      file_name="new-file.csv",
                      local_temp_folder="/tmp/",
                      content_type="application/csv",
                      sep=",",
                      save_row_index=False):
    df.to_csv(os.path.join(local_temp_folder, file_name), sep=sep, index=save_row_index)
    minio_results = minio_client.fput_object(bucket_name=bucket_name,
                                             object_name=file_name,
                                             file_path=os.path.join(local_temp_folder, file_name),
                                             content_type=content_type)
    assert minio_results.object_name == file_name
Another option is to do this with cloudpathlib, which supports S3 and also Google Cloud Storage and Azure Blob Storage. See example below.
import pandas as pd
from cloudpathlib import CloudPath
# read data from S3
df = pd.read_csv(CloudPath("s3://covid19-lake/rearc-covid-19-testing-data/csv/states_daily/states_daily.csv"))
# look at some of the data
df.head(1).T.iloc[:10]
#> 0
#> date 20210307
#> state AK
#> positive 56886.0
#> probableCases NaN
#> negative NaN
#> pending NaN
#> totalTestResultsSource totalTestsViral
#> totalTestResults 1731628.0
#> hospitalizedCurrently 33.0
#> hospitalizedCumulative 1293.0
# writing to S3
with CloudPath("s3://bucket-you-can-write-to/data.csv").open("w") as f:
df.to_csv(f)
CloudPath("s3://bucket-you-can-write-to/data.csv").exists()
#> True
Note that you can't call df.to_csv(CloudPath("s3://drivendata-public-assets/test-asdf2.csv")) directly because of the way pandas handles paths/handles passed to it. Instead you need to open the file for writing and pass that handle directly to to_csv.
This comes with a few added benefits, such as setting particular options, using different authentication mechanisms, or keeping a persistent cache so you don't always need to re-download from S3.
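For example, here is a hedged sketch of passing explicit credentials through a cloudpathlib S3Client (the key values are placeholders):

from cloudpathlib import S3Client

client = S3Client(aws_access_key_id="<access_key>", aws_secret_access_key="<secret_key>")
with client.CloudPath("s3://bucket-you-can-write-to/data.csv").open("w") as f:
    df.to_csv(f)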
from io import StringIO
import boto3
#Creating Session With Boto3.
session = boto3.Session(
aws_access_key_id='<your_access_key_id>',
aws_secret_access_key='<your_secret_access_key>'
)
#Creating S3 Resource From the Session.
s3_res = session.resource('s3')
csv_buffer = StringIO()
df.to_csv(csv_buffer)
bucket_name = 'stackvidhya'
s3_object_name = 'df.csv'
s3_res.Object(bucket_name, s3_object_name).put(Body=csv_buffer.getvalue())
print("Dataframe is saved as CSV in S3 bucket.")
For those who might have problems with S3FS or fsspec when using Lambda:
You have to create a layer for each library and insert them into your Lambda.
You can find how to create a layer here.
I read a CSV with two columns from an S3 bucket and put the contents of the CSV file into a pandas DataFrame.
Example:
config.json
{
"credential": {
"access_key":"xxxxxx",
"secret_key":"xxxxxx"
}
,
"s3":{
"bucket":"mybucket",
"key":"csv/user.csv"
}
}
cls_config.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import json

class cls_config(object):

    def __init__(self, filename):
        self.filename = filename

    def getConfig(self):
        fileName = os.path.join(os.path.dirname(__file__), self.filename)
        with open(fileName) as f:
            config = json.load(f)
        return config
cls_pandas.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import io
class cls_pandas(object):

    def __init__(self):
        pass

    def read(self, stream):
        df = pd.read_csv(io.StringIO(stream), sep=",")
        return df
cls_s3.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import boto3
import json
class cls_s3(object):

    def __init__(self, access_key, secret_key):
        self.s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)

    def getObject(self, bucket, key):
        read_file = self.s3.get_object(Bucket=bucket, Key=key)
        body = read_file['Body'].read().decode('utf-8')
        return body
test.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from cls_config import *
from cls_s3 import *
from cls_pandas import *
class test(object):

    def __init__(self):
        self.conf = cls_config('config.json')

    def process(self):
        conf = self.conf.getConfig()
        bucket = conf['s3']['bucket']
        key = conf['s3']['key']
        access_key = conf['credential']['access_key']
        secret_key = conf['credential']['secret_key']
        s3 = cls_s3(access_key, secret_key)
        ob = s3.getObject(bucket, key)
        pa = cls_pandas()
        df = pa.read(ob)
        print(df)

if __name__ == '__main__':
    test = test()
    test.process()