Import multiple CSV files from Google Cloud Bucket to Datalab - python

I'm trying to get the below code to import multiple csv files (ALLOWANCE1.csv and ALLOWANCE2.csv) from a Google Cloud Bucket to Datalab in python 2.x:
import numpy as np
import pandas as pd
from google.datalab import Context
import google.datalab.bigquery as bq
import google.datalab.storage as storage
from io import BytesIO
myBucket = storage.Bucket('Bucket Name')
object_list = myBucket.objects(prefix='ALLOWANCE')
df_list = []
for obj in object_list:
    %gcs read --object $obj.uri --variable data
    df_list.append(pd.read_csv(BytesIO(data)))
concatenated_df = pd.concat(df_list, ignore_index=True)
concatenated_df.head()
I'm getting the following error right at the beginning of the for loop:
RequestExceptionTraceback (most recent call last)
<ipython-input-5-3188aab389b8> in <module>()
----> 1 for obj in object_list:
      2     get_ipython().magic(u'gcs read --object $obj.uri --variable data')
      3     df_list.append(pd.read_csv(BytesIO(data)))

/usr/local/envs/py2env/lib/python2.7/site-packages/google/datalab/utils/_iterator.pyc in __iter__(self)
     34     """Provides iterator functionality."""
     35     while self._first_page or (self._page_token is not None):
---> 36       items, next_page_token = self._retriever(self._page_token, self._count)
     37
     38       self._page_token = next_page_token

/usr/local/envs/py2env/lib/python2.7/site-packages/google/datalab/storage/_object.pyc in _retrieve_objects(self, page_token, _)
    319                             page_token=page_token)
    320     except Exception as e:
--> 321       raise e
    322
    323     objects = list_info.get('items', [])

RequestException: HTTP request failed: Not Found
I have spent some time trying to resolve this issue, but no luck so far. Any help would be greatly appreciated!

I don't think you can mix notebook shell commands with Python variables. Perhaps try the subprocess Python library and call the command-line tools from Python instead.
import numpy as np
import pandas as pd
from google.datalab import Context
import google.datalab.bigquery as bq
import google.datalab.storage as storage
from io import BytesIO
# new lines
from subprocess import call
from google.colab import auth
auth.authenticate_user()
myBucket = storage.Bucket('Bucket Name')
object_list = myBucket.objects(prefix='ALLOWANCE')
df_list = []
for obj in object_list:
    call(['gsutil', 'cp', obj.uri, '/tmp/'])  #first copy file
    filename = obj.uri.split('/')[-1]  #get file name
    df_list.append(pd.read_csv('/tmp/' + filename))
concatenated_df = pd.concat(df_list, ignore_index=True)
concatenated_df.head()
Note that I did not run this code, but I have run "call" with my own files successfully. Another suggestion is to run the file-copy calls in one loop and then read the files in a second loop (see the sketch below). That way, if you iterate on your data a lot, you won't re-download the files each time.
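A minimal sketch of that two-pass variant, reusing the placeholder bucket name and the /tmp staging directory from above (untested, same caveats apply):
from subprocess import call
import pandas as pd
import google.datalab.storage as storage

myBucket = storage.Bucket('Bucket Name')
object_list = myBucket.objects(prefix='ALLOWANCE')

# First pass: copy every matching object to local disk once
local_paths = []
for obj in object_list:
    call(['gsutil', 'cp', obj.uri, '/tmp/'])
    local_paths.append('/tmp/' + obj.uri.split('/')[-1])

# Second pass: read the already-downloaded files
df_list = [pd.read_csv(path) for path in local_paths]
concatenated_df = pd.concat(df_list, ignore_index=True)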

Related

Getting _register_s3_control_events() Error

I am trying to read a CSV from an S3 bucket using my Jupyter notebook. I have read this CSV before with no issues, but now I am receiving an error.
Here is the code I am running:
import pandas as pd
list = pd.read_csv(r's3://analytics/wordlist.csv')
And the error I am getting is:
An error was encountered:
_register_s3_control_events() takes 2 positional arguments but 6 were given
I thought it might be the S3 bucket permissions, but the bucket is public to my organization, so that shouldn't be the issue.
Any ideas what might be wrong?
You could use boto to import the CSV from S3. Boto is a Python library for AWS.
This should work:
import boto
import pandas as pd
data = pd.read_csv('s3://bucket....csv')
If you are on Python 3.4+, you need:
import boto3
import io
import pandas as pd
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='bucket', Key='key')
df = pd.read_csv(io.BytesIO(obj['Body'].read()))

Read an object from Alibaba OSS and modify it using pandas python

So, my data is in the format of CSV files in the OSS bucket of Alibaba Cloud.
I am currently executing a Python script, wherein:
I download the file into my local machine.
Do the changes using Python script in my local machine.
Store it in AWS Cloud.
I have to modify this method and schedule a cron job in Alibaba Cloud to automate the running of this script.
The Python script will be uploaded into Task Management of Alibaba Cloud.
So the new steps will be:
Read a file from the OSS bucket into Pandas.
Modify it in pandas: merge it with other data and make some column changes.
Store the modified file into AWS RDS.
I am stuck at the first step itself.
Error Log:
"No module found" for OSS2 & pandas.
What is the correct way of doing it?
This is a rough draft of my script (this is how I was able to execute the script on my local machine):
import os, re
import oss2  -- **throws an error. No module found.**
import datetime as dt
import pandas as pd  -- **throws an error. No module found.**
import tarfile
import mysql.connector
from datetime import datetime
from itertools import islice

dates = (dt.datetime.now() + dt.timedelta(days=-1)).strftime("%Y%m%d")

def download_file(access_key_id, access_key_secret, endpoint, bucket):
    # Authentication
    auth = oss2.Auth(access_key_id, access_key_secret)
    # Bucket name
    bucket = oss2.Bucket(auth, endpoint, bucket)
    # Download the file
    try:
        # List all objects in the fun folder and its subfolders.
        for obj in oss2.ObjectIterator(bucket, prefix=dates + 'order'):
            order_file = obj.key
            objectName = order_file.split('/')[1]
            df = pd.read_csv(bucket.get_object(order_file))  # to read into pandas
            # FUNCTION to modify and upload
        print("File downloaded")
    except:
        print("Pls check!!! File not read")
    return objectName
The change is to read the OSS object into an io.BytesIO buffer and hand that to pandas:
import os, re
import oss2
import datetime as dt
import pandas as pd
import tarfile
import mysql.connector
from datetime import datetime
from itertools import islice
import io  ## include this new library

dates = (dt.datetime.now() + dt.timedelta(days=-1)).strftime("%Y%m%d")

def download_file(access_key_id, access_key_secret, endpoint, bucket):
    # Authentication
    auth = oss2.Auth(access_key_id, access_key_secret)
    # Bucket name
    bucket = oss2.Bucket(auth, endpoint, bucket)
    # Download the file
    try:
        # List all objects in the fun folder and its subfolders.
        for obj in oss2.ObjectIterator(bucket, prefix=dates + 'order'):
            order_file = obj.key
            objectName = order_file.split('/')[1]
            bucket_object = bucket.get_object(order_file).read()  ## read the file from OSS
            img_buf = io.BytesIO(bucket_object)
            df = pd.read_csv(img_buf)  # to read into pandas
            # FUNCTION to modify and upload
        print("File downloaded")
    except:
        print("Pls check!!! File not read")
    return objectName

Unpickling a file using os.path

I'm getting a weird error when running the following code:
import os
import urllib.request
import pandas as pd
data_url = 'URL FROM GOOGLE DRIVE DOWNLOAD'
file_name = 'mask.pkl'
data_dir = os.path.join('tempdata', 'test')
file_path = os.path.join(data_dir, file_name)
gdrive_file(file_path, data_url, data_dir)
x = pd.read_pickle(file_path)
mask = x[0]
Where data_url is a download link from Google Drive, but it's just a .pkl file; any .pkl you use to test this should throw the same error. The gdrive_file() function is defined as follows:
def gdrive_file(file_path, data_url, data_dir):
    if file_path is True:
        pass
    if not os.path.isfile(file_path):
        print('Fetching example data file')
        os.makedirs(data_dir, exist_ok=True)
        return urllib.request.urlretrieve(data_url, file_path)
Everything works great up to the point where I use pandas to read the .pkl file. I get the following error:
In [11]: pd.read_pickle(file_path)
---------------------------------------------------------------------------
UnpicklingError                           Traceback (most recent call last)
<ipython-input-11-f996be11e8eb> in <module>
----> 1 pd.read_pickle(file_path)
~/anaconda3/envs/reborn/lib/python3.7/site-packages/pandas/io/pickle.py in read_pickle(filepath_or_buffer, compression)
180 # We want to silence any warnings about, e.g. moved modules.
181 warnings.simplefilter("ignore", Warning)
--> 182 return pickle.load(f)
183 except excs_to_catch:
184 # e.g.
UnpicklingError: invalid load key, '<'.
I've used this same code to open other data types; it's just having an issue opening a .pkl file with pd.read_pickle(file_path), and I'm not sure why.

How to read a defined list of parquet files from s3 using PyArrow?

I need to incrementally load data into Pandas from Parquet files stored in S3. I'm trying to use PyArrow for this but am not having any luck.
Reading an entire directory of Parquet files into Pandas works just fine:
import s3fs
import pyarrow.parquet as pq
import pandas as pd
fs = s3fs.S3FileSystem(mykey,mysecret)
p_dataset = pq.ParquetDataset('s3://mys3bucket/directory', filesystem=fs)
df = p_dataset.read().to_pandas()
But when I try to load a single Parquet file I get an error:
fs = s3fs.S3FileSystem(mykey,mysecret)
p_dataset = pq.ParquetDataset('s3://mys3bucket/directory/1_0_00000000000000014012',
                              filesystem=fs)
df = p_dataset.read().to_pandas()
Throws error:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-179-3d01b32c60f7> in <module>()
15 p_dataset = pq.ParquetDataset(
16 's3://mys3bucket/directory/1_0_00000000000000014012',
---> 17 filesystem=fs)
18
19 table2.to_pandas()
C:\User\Anaconda3\lib\site-packages\pyarrow\parquet.py in __init__(self, path_or_paths, filesystem, schema, metadata, split_row_groups, validate_schema, filters, metadata_nthreads)
880
881 if validate_schema:
--> 882 self.validate_schemas()
883
884 if filters is not None:
C:\User\Anaconda3\lib\site-packages\pyarrow\parquet.py in validate_schemas(self)
893 self.schema = self.common_metadata.schema
894 else:
--> 895 self.schema = self.pieces[0].get_metadata(open_file).schema
896 elif self.schema is None:
897 self.schema = self.metadata.schema
IndexError: list index out of range
Would appreciate any help with this error.
Ideally I need to append all new data added to S3 (added since the previous time I ran this script) to the Pandas dataframe, so I was thinking I could pass a list of filenames to ParquetDataset. Is there a better way to achieve this? Thanks
You want to use pq.read_table (pass a file path or file handle) instead of pq.ParquetDataset (pass a directory). HTH
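For a defined list of files, something along these lines should work (a rough, untested sketch assuming the s3fs credentials and bucket layout from the question; the second file name is made up for illustration):
import s3fs
import pandas as pd
import pyarrow.parquet as pq

fs = s3fs.S3FileSystem(key=mykey, secret=mysecret)

# The list of new files added since the last run (names are placeholders)
new_files = ['mys3bucket/directory/1_0_00000000000000014012',
             'mys3bucket/directory/1_0_00000000000000014013']

tables = []
for path in new_files:
    with fs.open(path, 'rb') as f:       # open each object as a file handle
        tables.append(pq.read_table(f))  # read one Parquet file at a time
df = pd.concat([t.to_pandas() for t in tables], ignore_index=True)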
For Python 3.6+, AWS has a library called aws-data-wrangler that helps with the integration between Pandas/S3/Parquet.
To install:
pip install awswrangler
To read a single Parquet file from S3 using awswrangler 1.x.x and above:
import awswrangler as wr
df = wr.s3.read_parquet(path="s3://my_bucket/path/to/data_folder/my-file.parquet")
To read a list of Parquet files:
import awswrangler as wr
df = wr.s3.read_parquet(path="s3://my_bucket/path/to/data_folder/", dataset=True)
By setting dataset=True, awswrangler will read all the individual Parquet files below the S3 key.

Save Dataframe to csv directly to s3 Python

I have a pandas DataFrame that I want to upload to a new CSV file. The problem is that I don't want to save the file locally before transferring it to s3. Is there any method like to_csv for writing the dataframe to s3 directly? I am using boto3.
Here is what I have so far:
import boto3
s3 = boto3.client('s3', aws_access_key_id='key', aws_secret_access_key='secret_key')
read_file = s3.get_object(Bucket, Key)
df = pd.read_csv(read_file['Body'])
# Make alterations to DataFrame
# Then export DataFrame to CSV through direct transfer to s3
You can use:
from io import StringIO # python3; python2: BytesIO
import boto3
bucket = 'my_bucket_name' # already created on S3
csv_buffer = StringIO()
df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'df.csv').put(Body=csv_buffer.getvalue())
You can directly use the S3 path. I am using Pandas 0.24.1:
In [1]: import pandas as pd
In [2]: df = pd.DataFrame( [ [1, 1, 1], [2, 2, 2] ], columns=['a', 'b', 'c'])
In [3]: df
Out[3]:
a b c
0 1 1 1
1 2 2 2
In [4]: df.to_csv('s3://experimental/playground/temp_csv/dummy.csv', index=False)
In [5]: pd.__version__
Out[5]: '0.24.1'
In [6]: new_df = pd.read_csv('s3://experimental/playground/temp_csv/dummy.csv')
In [7]: new_df
Out[7]:
a b c
0 1 1 1
1 2 2 2
Release Note:
S3 File Handling
pandas now uses s3fs for handling S3 connections. This shouldn’t break any code. However, since s3fs is not a required dependency, you will need to install it separately, like boto in prior versions of pandas. GH11915.
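If s3fs is not installed yet, installing it is a one-liner (assuming pip is available in your environment):
pip install s3fs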
I like s3fs which lets you use s3 (almost) like a local filesystem.
You can do this:
import s3fs
bytes_to_write = df.to_csv(None).encode()
fs = s3fs.S3FileSystem(key=key, secret=secret)
with fs.open('s3://bucket/path/to/file.csv', 'wb') as f:
    f.write(bytes_to_write)
s3fs supports only rb and wb modes for opening the file, which is why I did this bytes_to_write step.
This is a more up-to-date answer:
import s3fs
s3 = s3fs.S3FileSystem(anon=False)
# Use 'w' for py3, 'wb' for py2
with s3.open('<bucket-name>/<filename>.csv', 'w') as f:
    df.to_csv(f)
The problem with StringIO is that it will eat away at your memory. With this method, you are streaming the file to S3 rather than converting it to a string and then writing it into S3. Holding the pandas dataframe and its string copy in memory seems very inefficient.
If you are working on an EC2 instance, you can give it an IAM role to enable writing to S3, so you don't need to pass in credentials directly. However, you can also connect to a bucket by passing credentials to the S3FileSystem() function. See the documentation: https://s3fs.readthedocs.io/en/latest/
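For example, passing credentials explicitly might look like this (a sketch; the key and secret values are placeholders):
import s3fs

# Explicit credentials instead of relying on an IAM role
s3 = s3fs.S3FileSystem(key='<access-key-id>', secret='<secret-access-key>')
with s3.open('<bucket-name>/<filename>.csv', 'w') as f:
    df.to_csv(f)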
If you pass None as the first argument to to_csv() the data will be returned as a string. From there it's an easy step to upload that to S3 in one go.
It should also be possible to pass a StringIO object to to_csv(), but using a string will be easier.
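A minimal sketch of that approach (the bucket and key names are placeholders):
import boto3

csv_string = df.to_csv(None)  # to_csv(None) returns the CSV contents as a string
boto3.client('s3').put_object(Bucket='my-bucket', Key='output/df.csv', Body=csv_string)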
You can also use the AWS Data Wrangler:
import awswrangler as wr
wr.s3.to_csv(
    df=df,
    path="s3://...",
)
Note that it will handle multipart upload for you to make the upload faster.
I found this can be done using the client as well, not just the resource.
from io import StringIO
import boto3
s3 = boto3.client("s3",
                  region_name=region_name,
                  aws_access_key_id=aws_access_key_id,
                  aws_secret_access_key=aws_secret_access_key)
csv_buf = StringIO()
df.to_csv(csv_buf, header=True, index=False)
csv_buf.seek(0)
s3.put_object(Bucket=bucket, Body=csv_buf.getvalue(), Key='path/test.csv')
I use AWS Data Wrangler. For example:
import awswrangler as wr
import pandas as pd
# read a local dataframe
df = pd.read_parquet('my_local_file.gz')
# upload to S3 bucket
wr.s3.to_parquet(df=df, path='s3://mys3bucket/file_name.gz')
The same applies to csv files. Instead of read_parquet and to_parquet, use read_csv and to_csv with the proper file extension.
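A sketch of the CSV variant (the file names and bucket path are placeholders, not from the original answer):
import awswrangler as wr
import pandas as pd

# read a local dataframe
df = pd.read_csv('my_local_file.csv')

# upload to S3 bucket
wr.s3.to_csv(df=df, path='s3://mys3bucket/file_name.csv', index=False)

# read it back from S3
df2 = wr.s3.read_csv(path='s3://mys3bucket/file_name.csv')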
Since you are using boto3.client(), try:
import boto3
from io import StringIO #python3
s3 = boto3.client('s3', aws_access_key_id='key', aws_secret_access_key='secret_key')
def copy_to_s3(client, df, bucket, filepath):
    csv_buf = StringIO()
    df.to_csv(csv_buf, header=True, index=False)
    csv_buf.seek(0)
    client.put_object(Bucket=bucket, Body=csv_buf.getvalue(), Key=filepath)
    print(f'Copy {df.shape[0]} rows to S3 Bucket {bucket} at {filepath}, Done!')

copy_to_s3(client=s3, df=df_to_upload, bucket='abc', filepath='def/test.csv')
You can use:
pandas
boto3
s3fs (version ≤0.4)
I use to_csv with an s3:// path and storage_options:
key = "folder/file.csv"
df.to_csv(
f"s3://{YOUR_S3_BUCKET}/{key}",
index=False,
storage_options={
"key": AWS_ACCESS_KEY_ID,
"secret": AWS_SECRET_ACCESS_KEY,
"token": AWS_SESSION_TOKEN,
},
To handle large files efficiently, you can also use the open-source, S3-compatible MinIO server with its minio Python client package, as in this function of mine:
import minio
import os
import pandas as pd
minio_client = minio.Minio(..)
def write_df_to_minio(df,
                      minio_client,
                      bucket_name,
                      file_name="new-file.csv",
                      local_temp_folder="/tmp/",
                      content_type="application/csv",
                      sep=",",
                      save_row_index=False):
    df.to_csv(os.path.join(local_temp_folder, file_name), sep=sep, index=save_row_index)
    minio_results = minio_client.fput_object(bucket_name=bucket_name,
                                             object_name=file_name,
                                             file_path=os.path.join(local_temp_folder, file_name),
                                             content_type=content_type)
    assert minio_results.object_name == file_name
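A hypothetical call, assuming the bucket already exists and df is the DataFrame to upload:
write_df_to_minio(df, minio_client, bucket_name="my-bucket", file_name="df.csv")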
Another option is to do this with cloudpathlib, which supports S3 and also Google Cloud Storage and Azure Blob Storage. See example below.
import pandas as pd
from cloudpathlib import CloudPath
# read data from S3
df = pd.read_csv(CloudPath("s3://covid19-lake/rearc-covid-19-testing-data/csv/states_daily/states_daily.csv"))
# look at some of the data
df.head(1).T.iloc[:10]
#> 0
#> date 20210307
#> state AK
#> positive 56886.0
#> probableCases NaN
#> negative NaN
#> pending NaN
#> totalTestResultsSource totalTestsViral
#> totalTestResults 1731628.0
#> hospitalizedCurrently 33.0
#> hospitalizedCumulative 1293.0
# writing to S3
with CloudPath("s3://bucket-you-can-write-to/data.csv").open("w") as f:
    df.to_csv(f)
CloudPath("s3://bucket-you-can-write-to/data.csv").exists()
#> True
Note that you can't call df.to_csv(CloudPath("s3://drivendata-public-assets/test-asdf2.csv")) directly because of the way pandas handles the paths/handles passed to it. Instead you need to open the file for writing and pass that handle directly to to_csv.
This comes with a few added benefits, such as setting particular options or different authentication mechanisms, or keeping a persistent cache so you don't always need to re-download from S3.
from io import StringIO
import boto3
#Creating Session With Boto3.
session = boto3.Session(
    aws_access_key_id='<your_access_key_id>',
    aws_secret_access_key='<your_secret_access_key>'
)
#Creating S3 Resource From the Session.
s3_res = session.resource('s3')
csv_buffer = StringIO()
df.to_csv(csv_buffer)
bucket_name = 'stackvidhya'
s3_object_name = 'df.csv'
s3_res.Object(bucket_name, s3_object_name).put(Body=csv_buffer.getvalue())
print("Dataframe is saved as CSV in S3 bucket.")
For those who might have problems with S3FS or fsspec when using Lambda:
You have to create a layer for each library and add them to your Lambda function.
You can find how to create a layer here.
I read a CSV with two columns from an S3 bucket and load the contents of the CSV file into a pandas DataFrame.
Example:
config.json
{
    "credential": {
        "access_key": "xxxxxx",
        "secret_key": "xxxxxx"
    },
    "s3": {
        "bucket": "mybucket",
        "key": "csv/user.csv"
    }
}
cls_config.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import json

class cls_config(object):

    def __init__(self, filename):
        self.filename = filename

    def getConfig(self):
        fileName = os.path.join(os.path.dirname(__file__), self.filename)
        with open(fileName) as f:
            config = json.load(f)
        return config
cls_pandas.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import io

class cls_pandas(object):

    def __init__(self):
        pass

    def read(self, stream):
        df = pd.read_csv(io.StringIO(stream), sep=",")
        return df
cls_s3.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import boto3
import json

class cls_s3(object):

    def __init__(self, access_key, secret_key):
        self.s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)

    def getObject(self, bucket, key):
        read_file = self.s3.get_object(Bucket=bucket, Key=key)
        body = read_file['Body'].read().decode('utf-8')
        return body
test.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from cls_config import *
from cls_s3 import *
from cls_pandas import *

class test(object):

    def __init__(self):
        self.conf = cls_config('config.json')

    def process(self):
        conf = self.conf.getConfig()
        bucket = conf['s3']['bucket']
        key = conf['s3']['key']
        access_key = conf['credential']['access_key']
        secret_key = conf['credential']['secret_key']
        s3 = cls_s3(access_key, secret_key)
        ob = s3.getObject(bucket, key)
        pa = cls_pandas()
        df = pa.read(ob)
        print(df)

if __name__ == '__main__':
    test = test()
    test.process()
