Pandas read_pickle from s3 bucket - python

I am working on a Jupyter notebook from AWS EMR.
I am able to do this:
pd.read_csv("s3:\\mypath\\xyz.csv').
However, if I try to open a pickle file like this, pd.read_pickle("s3:\\mypath\\xyz.pkl")
I am getting this error:
[Errno 2] No such file or directory: 's3://pvarma1/users/users/candidate_users.pkl'
Traceback (most recent call last):
File "/usr/local/lib64/python2.7/site-packages/pandas/io/pickle.py", line 179, in read_pickle
return try_read(path)
File "/usr/local/lib64/python2.7/site-packages/pandas/io/pickle.py", line 177, in try_read
lambda f: pc.load(f, encoding=encoding, compat=True))
File "/usr/local/lib64/python2.7/site-packages/pandas/io/pickle.py", line 146, in read_wrapper
is_text=False)
File "/usr/local/lib64/python2.7/site-packages/pandas/io/common.py", line 421, in _get_handle
f = open(path_or_buf, mode)
IOError: [Errno 2] No such file or d
However, I can see both xyz.csv and xyz.pkl in the same path! Can anyone help?

Pandas read_pickle supports only local paths, unlike read_csv. So you should be copying the pickle file to your machine before reading it in pandas.

Since read_pickle does not support this, you can use smart_open:
from smart_open import open
s3_file_name = "s3://bucket/key"
with open(s3_file_name, 'rb') as f:
df = pd.read_pickle(f)

Related

"OSError: [Errno 9] Bad file descriptor" error when trying to save excel file with openpyxl in python

I have a couple of excel files I want to merge into one.
I need the second column on all the files to be copied into separate columns in a new Microsoft Excel file.
For this, I am using the openpyxl library in a python script.
This is my code:
import os
from openpyxl import load_workbook
def mergeDataFiles():
path = "C:\\Users\\ethan\\Desktop\\Benzoyl Chloride\\Benzoyl Chloride"
# source excel files
origin_files = list()
for path, subdirs, files in os.walk(path):
for file_index in range(len(files)):
origin_files.append(files[file_index])
# destination excel file
destination_file = path + ".xlsx"
destination_workbook = load_workbook(destination_file)
destination_sheet = destination_workbook["Sheet1"]
# copy data from source files to destination file
for origin_file_index in range(1, len(origin_files)):
origin_workbook = load_workbook(path + "\\" + origin_files[origin_file_index - 1])
origin_sheet = origin_workbook['Data']
destination_sheet.cell(row=1, column=origin_file_index).value = origin_files[origin_file_index - 1]
for i in range(1, 500):
# read cell value from source excel file
data = origin_sheet.cell(row=i, column=2)
# write the value to destination excel file
destination_sheet.cell(row=i + 1, column=origin_file_index).value = data.value
# saving the destination excel file
destination_workbook.save(destination_file)
if __name__ == "__main__":
mergeDataFiles()
When I run the code, I get an error on the last line in the function: OSError: [Errno 9] Bad file descriptor.
Full traceback:
C:\Users\ethan\.venv\Scripts\python.exe "C:/Users/ethan/Coding/Python/Copy Excel Data/main.py"
Traceback (most recent call last):
File "C:\Users\ethan\Coding\Python\Copy Excel Data\main.py", line 32, in <module>
mergeDataFiles()
File "C:\Users\ethan\Coding\Python\Copy Excel Data\main.py", line 28, in mergeDataFiles
destination_workbook.save(destination_file)
File "C:\Users\ethan\.venv\Lib\site-packages\openpyxl\workbook\workbook.py", line 407, in save
save_workbook(self, filename)
File "C:\Users\ethan\.venv\Lib\site-packages\openpyxl\writer\excel.py", line 293, in save_workbook
writer.save()
File "C:\Users\ethan\.venv\Lib\site-packages\openpyxl\writer\excel.py", line 275, in save
self.write_data()
File "C:\Users\ethan\.venv\Lib\site-packages\openpyxl\writer\excel.py", line 67, in write_data
archive.writestr(ARC_APP, tostring(props.to_tree()))
File "C:\Program Files\Python311\Lib\zipfile.py", line 1830, in writestr
with self.open(zinfo, mode='w') as dest:
File "C:\Program Files\Python311\Lib\zipfile.py", line 1204, in close
self._fileobj.seek(self._zinfo.header_offset)
OSError: [Errno 9] Bad file descriptor
Exception ignored in: <function ZipFile.__del__ at 0x000001D101443D80>
Traceback (most recent call last):
File "C:\Program Files\Python311\Lib\zipfile.py", line 1870, in __del__
self.close()
File "C:\Program Files\Python311\Lib\zipfile.py", line 1892, in close
self._fpclose(fp)
File "C:\Program Files\Python311\Lib\zipfile.py", line 1992, in _fpclose
fp.close()
OSError: [Errno 9] Bad file descriptor
Process finished with exit code 1
I have tried changing the file names and locations, having the destination file open and closed, scouring the internet for solutions and at this point I'm not sure what else I can try.
I am running the code on Windows 10 22H2, with an intel i5 cpu.
Please assist me with this issue, if you know how to solve it.

How to export .csv file from python and using pandas DataFrame

I am trying to export some filtered data from Python using Pandas DF to .csv file (Personal Learning project)
Code : df5.to_csv(r'/C:/Users/j/Downloads/data1/export.csv')
Error:
Traceback (most recent call last):
File "C:\Users\jansa\PycharmProjects\bbb\main.py", line 62, in <module>
df5.to_csv(r'/C:/Users/jansa/Downloads/data1/export.csv')
File "C:\Users\jansa\PycharmProjects\bbb\venv\lib\site-packages\pandas\core\generic.py", line 3551, in to_csv
return DataFrameRenderer(formatter).to_csv(
File "C:\Users\jansa\PycharmProjects\bbb\venv\lib\site-packages\pandas\io\formats\format.py", line 1180, in to_csv
csv_formatter.save()
File "C:\Users\jansa\PycharmProjects\bbb\venv\lib\site-packages\pandas\io\formats\csvs.py", line 241, in save with get_handle(
File "C:\Users\jansa\PycharmProjects\bbb\venv\lib\site-packages\pandas\io\common.py", line 697, in get_handle
check_parent_directory(str(handle))
File "C:\Users\jansa\PycharmProjects\bbb\venv\lib\site-packages\pandas\io\common.py", line 571, in check_parent_directory
raise OSError(rf"Cannot save file into a non-existent directory: '{parent}'")
OSError: Cannot save file into a non-existent directory: '\C:\Users\jansa\Downloads\data1'
I am researching, but cannot pinpoint the error.
Try
df.to_csv(r'C:\path\to\directory\filename.csv')
Generally, in Linux/Mac environment path separator is '/' but in windows, it is '\'. Also, the absolute path starts with '/' in Linux/Mac, while in windows, it starts with / So, using arguments in to_csv with C:\Users\j\Downloads\data1\export.csv' will resolve your issue.
In addition, if you want to get rid of such situations, you can do this:
import os
path = os.path.join('.', 'export.csv') #will save the file in current directory
Also, this returns the os path separator:
print(os.sep)

Why can't I access the excel file in python?

I'm trying to use some data that I have in an excel file. However, I'm getting an error saying that it doesn't find the file. I've looked up and the directory and the file name are correct, What am I doing wrong?
Here is the code:
import os
import pandas as pd
print(os.getcwd())
df = pd.read_excel(r'C:/Users/Eder/Desktop/TFG/Data/Interpolation_sample.xlsx',
index_col =0,parse_dates=True, sheet_name='sheet3')
And the answer from the console:
runcell(0, 'C:/Users/Eder/untitled0.py')
C:\Users\Eder\Desktop\TFG\Data
Traceback (most recent call last):
File "C:\Users\Eder\untitled0.py", line 14, in <module>
index_col =0,parse_dates=True, sheet_name='sheet3')
File "E:\Anaconda3\lib\site-packages\pandas\util\_decorators.py", line 299, in wrapper
return func(*args, **kwargs)
File "E:\Anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 336, in read_excel
io = ExcelFile(io, storage_options=storage_options, engine=engine)
File "E:\Anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 1072, in __init__
content=path_or_buffer, storage_options=storage_options
File "E:\Anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 950, in inspect_excel_format
content_or_path, "rb", storage_options=storage_options, is_text=False
File "E:\Anaconda3\lib\site-packages\pandas\io\common.py", line 651, in get_handle
handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Eder\\Desktop\\TFG\\Data\\Interpolation_sample.xlsx'
I've figured out a way to solve the problem. I just changed the name of the file from 'Interpolation_sample' to 'Interpolation sample'. I don't know why, but the underscore in the file name is what was causing this error.

IOE error using dataframe.to_csv and dataframe.save to write data to files

I was trying to save dataframe for later use in pandas.
However, I had the error below.
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/source/Linux/pkg/python-2.7.3/lib/python2.7/site-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/series.py", line 2881, in to_csv
encoding=encoding)
File "/source/Linux/pkg/python-2.7.3/lib/python2.7/site-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py", line 1393, in to_csv
formatter.save()
File "/source/Linux/pkg/python-2.7.3/lib/python2.7/site-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py", line 963, in save
f.close()
IOError: [Errno 5] Input/output error
dataframe.save fails even for a simple object a = DataFrame({'a':[1,3,4],'b':[3,4,5]}).
The save method is deprecated. You should us to_pickle instead. It looks like you're using pandas 0.11 which quite old. The latest version is 0.16.
You could also consider saving it to csv or HDF5.
http://pandas.pydata.org/pandas-docs/stable/io.html

pandas HDFStore - how to reopen?

I created a file by using:
store = pd.HDFStore('/home/.../data.h5')
and stored some tables using:
store['firstSet'] = df1
store.close()
I closed down python and reopened in a fresh environment.
How do I reopen this file?
When I go:
store = pd.HDFStore('/home/.../data.h5')
I get the following error.
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/misc/apps/linux/python-2.6.1/lib/python2.6/site-packages/pandas-0.10.0-py2.6-linux-x86_64.egg/pandas/io/pytables.py", line 207, in __init__
self.open(mode=mode, warn=False)
File "/misc/apps/linux/python-2.6.1/lib/python2.6/site-packages/pandas-0.10.0-py2.6-linux-x86_64.egg/pandas/io/pytables.py", line 302, in open
self.handle = _tables().openFile(self.path, self.mode)
File "/apps/linux/python-2.6.1/lib/python2.6/site-packages/tables/file.py", line 230, in openFile
return File(filename, mode, title, rootUEP, filters, **kwargs)
File "/apps/linux/python-2.6.1/lib/python2.6/site-packages/tables/file.py", line 495, in __init__
self._g_new(filename, mode, **params)
File "hdf5Extension.pyx", line 317, in tables.hdf5Extension.File._g_new (tables/hdf5Extension.c:3039)
tables.exceptions.HDF5ExtError: HDF5 error back trace
File "H5F.c", line 1582, in H5Fopen
unable to open file
File "H5F.c", line 1373, in H5F_open
unable to read superblock
File "H5Fsuper.c", line 334, in H5F_super_read
unable to find file signature
File "H5Fsuper.c", line 155, in H5F_locate_signature
unable to find a valid file signature
End of HDF5 error back trace
Unable to open/create file '/home/.../data.h5'
What am I doing wrong here? Thank you.
In my hands, following approach works best:
df = pd.DataFrame(...)
"write"
with pd.HDFStore('test.h5', mode='w') as store:
store.append('df', df, data_columns= df.columns, format='table')
"read"
with pd.HDFStore('test.h5', mode='r') as newstore:
df_restored = newstore.select('df')
You could try doing instead:
store = pd.io.pytables.HDFStore('/home/.../data.h5')
df1 = store['firstSet']
or use the read method directly:
df1 = pd.read_hdf('/home/.../data.h5', 'firstSet')
Either way, you should have pandas 0.12.0 or higher...
I had the same problem and finally fixed it by installing the pytables module (next to the pandas modules which I was using):
conda install pytables
which got me numexpr-2.4.3 and pytables-3.2.0
After that it worked. I am using pandas 0.16.2 under python 2.7.9

Categories

Resources