I'm getting a weird error when running the following code:
import os
import urllib.request
import pandas as pd
data_url = 'URL FROM GOOGLE DRIVE DOWNLOAD'
file_name = 'mask.pkl'
data_dir = os.path.join('tempdata', 'test')
file_path = os.path.join(data_dir, file_name)
gdrive_file(file_path, data_url, data_dir)
x = pd.read_pickle(file_path)
mask = x[0]
Here data_url is a download link from Google Drive, but it's just a .pkl file; any .pkl file you use to test this should throw the same error. The gdrive_file() function is defined as follows:
def gdrive_file(file_path, data_url, data_dir):
    if file_path is True:
        pass
    if not os.path.isfile(file_path):
        print('Fetching example data file')
        os.makedirs(data_dir, exist_ok=True)
        return urllib.request.urlretrieve(data_url, file_path)
Everything works great up to the point where I use pandas to read the .pkl file. I get the following error:
In [11]: pd.read_pickle(file_path)
---------------------------------------------------------------------------
UnpicklingError                           Traceback (most recent call last)
<ipython-input-11-f996be11e8eb> in <module>
----> 1 pd.read_pickle(file_path)
~/anaconda3/envs/reborn/lib/python3.7/site-packages/pandas/io/pickle.py in read_pickle(filepath_or_buffer, compression)
180 # We want to silence any warnings about, e.g. moved modules.
181 warnings.simplefilter("ignore", Warning)
--> 182 return pickle.load(f)
183 except excs_to_catch:
184 # e.g.
UnpicklingError: invalid load key, '<'.
I've used this same code to open other data types; it only has a problem opening a .pkl file with pd.read_pickle(file_path), and I'm not sure why.
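That '<' load key is the giveaway: Google Drive often serves an HTML download/confirmation page instead of the raw file, and an HTML page begins with '<'. A minimal check of what was actually downloaded (a hedged sketch reusing file_path from the question):
# Peek at the first bytes of the downloaded file; output such as
# b'<!DOCTYPE html...' means a web page was saved rather than a pickle.
with open(file_path, 'rb') as f:
    print(f.read(64))

# For small, publicly shared Drive files, a direct-download URL of this
# form usually returns the raw bytes ('FILE_ID' is a placeholder):
data_url = 'https://drive.google.com/uc?export=download&id=FILE_ID'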
Related
I am trying to use pickle to save and load my ML models, but I get an error. Here is a simplified version of my code to save my model:
import pickle
def test(x, y):
    return x + y
filename = 'test.pkl'
pickle.dump(test, open(filename, 'wb'))
I can load the pickle file from the same notebook that I am creating it in, but if I close the notebook and try to load the pickle in a new one with the code below:
import pickle
filename = 'test.pkl'
loaded_model = pickle.load(open(filename, 'rb'))
It gives me this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[2], line 2
1 filename = 'test.pkl'
----> 2 loaded_model = pickle.load(open(filename, 'rb'))
AttributeError: Can't get attribute 'test' on <module '__main__'>
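pickle stores functions by reference, not by value: the file only records the qualified name __main__.test, and pickle.load looks that name up again at load time. A fresh notebook has no test in its __main__, hence the AttributeError. A minimal sketch of the usual fix, assuming a helper module (the name mymodule.py is hypothetical):
# mymodule.py
def test(x, y):
    return x + y

# In the notebook -- pickle now records the function as 'mymodule.test':
import pickle
from mymodule import test

filename = 'test.pkl'
with open(filename, 'wb') as f:
    pickle.dump(test, f)

# Any later session can load it, as long as mymodule is on the import
# path; pickle imports the module automatically to resolve the name:
with open(filename, 'rb') as f:
    loaded_model = pickle.load(f)
print(loaded_model(1, 2))  # 3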
When I run the Python code below, it generates a PDF file with blank pages, although I am expecting it to generate the PDF exactly as it is saved in the MS Access report. Can someone please let me know what I'm missing? Below is my code:
import win32com.client as win
import os
import time
s_current_working_directory = os.getcwd()
script = os.path.realpath(__file__)
def file_open(file_name):
    if os.path.exists(file_name):
        try:
            os.rename(file_name, file_name)
            return False
        except:
            print("File Open " + file_name)
            time.sleep(2)
            file_open(file_name)
            return True
    else:
        return False
        raise NameError
access = win.Dispatch("Access.Application")
access.visible = 1
db = access.OpenCurrentDatabase(os.path.join(s_current_working_directory, 'Database.accdb'))
access.DoCmd.OpenReport('ReportName',1)
if os.path.isfile(filename):
    os.remove(filename)
access.DoCmd.OutputTo(3, 'ReportName', r'PDF Format (*.pdf)', r'export_path')
access.DoCmd.CloseDatabase
access.Quit()
access=None
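One commonly reported cause of blank pages is exporting a report that Access never actually rendered. A hedged sketch of just the export step, reusing the question's 'ReportName' placeholder (the output path here is hypothetical): open the report in print preview before calling OutputTo.
access.DoCmd.OpenReport('ReportName', 2)  # 2 = acViewPreview, forces rendering
access.DoCmd.OutputTo(3, 'ReportName', 'PDF Format (*.pdf)',
                      r'C:\temp\ReportName.pdf')  # 3 = acOutputReport
access.DoCmd.Close(3, 'ReportName')  # 3 = acReport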
I wanted to load a CSV file from a zipped folder from a URL into a Pandas DataFrame. I referred here and used the same solution as follows:
from urllib import request
import zipfile
import pandas as pd
# link to the zip file
link = 'https://cricsheet.org/downloads/'
# the zip file is named as ipl_csv2.zip
request.urlretrieve(link, 'ipl_csv2.zip')
compressed_file = zipfile.ZipFile('ipl_csv2.zip')
# I need the csv file named all_matches.csv from ipl_csv2.zip
csv_file = compressed_file.open('all_matches.csv')
data = pd.read_csv(csv_file)
data.head()
But after running the code, I'm getting an error as:
---------------------------------------------------------------------------
BadZipFile                                Traceback (most recent call last)
<ipython-input-3-7b7a01259813> in <module>
1 link = 'https://cricsheet.org/downloads/'
2 request.urlretrieve(link, 'ipl_csv2.zip')
----> 3 compressed_file = zipfile.ZipFile('ipl_csv2.zip')
4 csv_file = compressed_file.open('all_matches.csv')
5 data = pd.read_csv(csv_file)
~\Anaconda3\lib\zipfile.py in __init__(self, file, mode, compression, allowZip64, compresslevel, strict_timestamps)
1267 try:
1268 if mode == 'r':
-> 1269 self._RealGetContents()
1270 elif mode in ('w', 'x'):
1271 # set the modified flag so central directory gets written
~\Anaconda3\lib\zipfile.py in _RealGetContents(self)
1334 raise BadZipFile("File is not a zip file")
1335 if not endrec:
-> 1336 raise BadZipFile("File is not a zip file")
1337 if self.debug > 1:
1338 print(endrec)
BadZipFile: File is not a zip file
I'm not very used to zip file handling in Python, so please help me out here: what correction do I need to make in my code?
If I open the URL https://cricsheet.org/downloads/ipl_csv2.zip in a web browser, the zip file is automatically downloaded to my system. As data is added to this zip file daily, I want to access the URL and get the CSV file directly via Python to save storage.
Edit 1: If you have any other code solution, please do share...
This is what I did after discussion with @nobleknight below:
# importing libraries
import zipfile
from urllib.request import urlopen
import shutil
import os
import pandas as pd

url = 'https://cricsheet.org/downloads/ipl_csv2.zip'
file_name = 'ipl_csv2.zip'
# downloading the zip file from the URL
with urlopen(url) as response, open(file_name, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)
# extracting the required file from the zip file
with zipfile.ZipFile(file_name) as zf:
    zf.extract('all_matches.csv')
# deleting the zip file from the directory
os.remove('ipl_csv2.zip')
# loading data from the file
data = pd.read_csv('all_matches.csv')
This solution avoids the ContentTooShortError and HTTP Forbidden errors that I was getting with every other solution I found on the net. Thanks to @nobleknight for providing part of the solution with reference to this.
Any other thoughts are welcome.
Try this:
link = "https://cricsheet.org/downloads/ipl_csv2.zip"
Don't worry if the file gets downloaded when you open the link; cancel the download if you don't want the file. You will always get updated data from this link.
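If the goal is to save storage, a minimal in-memory variant (an untested sketch of the same idea) streams the archive into a BytesIO buffer and reads the CSV straight out of it, so nothing is written to disk:
import io
import zipfile
from urllib.request import urlopen
import pandas as pd

url = 'https://cricsheet.org/downloads/ipl_csv2.zip'
with urlopen(url) as response:
    buffer = io.BytesIO(response.read())  # whole zip kept in memory
with zipfile.ZipFile(buffer) as zf:
    with zf.open('all_matches.csv') as csv_file:
        data = pd.read_csv(csv_file)
print(data.head())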
I'm trying to get the code below to import multiple CSV files (ALLOWANCE1.csv and ALLOWANCE2.csv) from a Google Cloud Storage bucket into Datalab using Python 2.x:
import numpy as np
import pandas as pd
from google.datalab import Context
import google.datalab.bigquery as bq
import google.datalab.storage as storage
from io import BytesIO
myBucket = storage.Bucket('Bucket Name')
object_list = myBucket.objects(prefix='ALLOWANCE')
df_list = []
for obj in object_list:
    %gcs read --object $obj.uri --variable data
    df_list.append(pd.read_csv(BytesIO(data)))
concatenated_df = pd.concat(df_list, ignore_index=True)
concatenated_df.head()
I'm getting the following error right at the beginning of the for loop:
---------------------------------------------------------------------------
RequestException                          Traceback (most recent call last)
<ipython-input-5-3188aab389b8> in <module>()
----> 1 for obj in object_list:
      2     get_ipython().magic(u'gcs read --object $obj.uri --variable data')
      3     df_list.append(pd.read_csv(BytesIO(data)))
/usr/local/envs/py2env/lib/python2.7/site-packages/google/datalab/utils/_iterator.pyc in __iter__(self)
34 """Provides iterator functionality."""
35 while self._first_page or (self._page_token is not None):
---> 36 items, next_page_token = self._retriever(self._page_token, self._count)
37
38 self._page_token = next_page_token
/usr/local/envs/py2env/lib/python2.7/site-packages/google/datalab/storage/_object.pyc in _retrieve_objects(self, page_token, _)
319 page_token=page_token)
320 except Exception as e:
--> 321 raise e
322
323 objects = list_info.get('items', [])
RequestException: HTTP request failed: Not Found
I have spent some time trying to resolve this issue, but no luck! Any help would be greatly appreciated!
I don't think you can mix notebook shell commands with Python variables. Perhaps try the subprocess Python lib and call the command-line commands from Python.
import numpy as np
import pandas as pd
from google.datalab import Context
import google.datalab.bigquery as bq
import google.datalab.storage as storage
from io import BytesIO
# new lines
from subprocess import call
from google.colab import auth
auth.authenticate_user()

myBucket = storage.Bucket('Bucket Name')
object_list = myBucket.objects(prefix='ALLOWANCE')
df_list = []
for obj in object_list:
    call(['gsutil', 'cp', obj.uri, '/tmp/'])  # first copy the file
    filename = obj.uri.split('/')[-1]         # get the file name
    df_list.append(pd.read_csv('/tmp/' + filename))
concatenated_df = pd.concat(df_list, ignore_index=True)
concatenated_df.head()
Note that I did not run this code, but I have run call with my own files successfully. Another suggestion is to run the file-copy calls in one loop before reading the files, as in the sketch below; that way, if you iterate a lot on your data, you're not re-downloading it each time.
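A sketch of that two-pass variant, under the same assumptions as the code above (same bucket, same /tmp/ staging directory):
local_paths = []
for obj in object_list:
    call(['gsutil', 'cp', obj.uri, '/tmp/'])  # download pass: copy each file once
    local_paths.append('/tmp/' + obj.uri.split('/')[-1])

# read pass: re-running this part never touches the network
df_list = [pd.read_csv(path) for path in local_paths]
concatenated_df = pd.concat(df_list, ignore_index=True)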
I am trying to convert an Excel spreadsheet to PDF using Python and the comtypes package using this code:
import os
import comtypes.client
FORMAT_PDF = 17
SOURCE_DIR = 'C:/Users/IEUser/Documents/jscript/test/resources/root3'
TARGET_DIR = 'C:/Users/IEUser/Documents/jscript'
app = comtypes.client.CreateObject('Excel.Application')
app.Visible = False
infile = os.path.join(os.path.abspath(SOURCE_DIR), 'spreadsheet1.xlsx')
outfile = os.path.join(os.path.abspath(TARGET_DIR), 'spreadsheet1.pdf')
doc = app.Workbooks.Open(infile)
doc.SaveAs(outfile, FileFormat=FORMAT_PDF)
doc.Close()
app.Quit()
The script above runs fine and the PDF file is created, but when I try to open it I get the error "The file cannot be opened - there is a problem with the file format" (after closing this error dialog it is actually possible to preview the PDF file). I have tried a similar script to convert Word documents to PDFs, and that worked just fine. Any ideas on how I can resolve this problem with the file format error?
Found a solution - this seems to be working:
import os
import comtypes.client
SOURCE_DIR = 'C:/Users/IEUser/Documents/jscript/test/resources/root3'
TARGET_DIR = 'C:/Users/IEUser/Documents/jscript'
app = comtypes.client.CreateObject('Excel.Application')
app.Visible = False
infile = os.path.join(os.path.abspath(SOURCE_DIR), 'spreadsheet1.xlsx')
outfile = os.path.join(os.path.abspath(TARGET_DIR), 'spreadsheet1.pdf')
doc = app.Workbooks.Open(infile)
doc.ExportAsFixedFormat(0, outfile, 1, 0)
doc.Close()
app.Quit()
This link may also be helpful as inspiration regarding the arguments to the ExportAsFixedFormat function: Document.ExportAsFixedFormat Method (although some of the argument values have to be modified a bit).
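For reference, my reading of the positional arguments in the call above (hedged against the Excel Workbook.ExportAsFixedFormat signature; worth verifying the constants for your Office version):
doc.ExportAsFixedFormat(
    0,        # Type: 0 = xlTypePDF (1 = xlTypeXPS)
    outfile,  # Filename
    1,        # Quality: 0 = xlQualityStandard, 1 = xlQualityMinimum
    0,        # IncludeDocProperties
)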
You need to use ExportAsFixedFormat(0, output_file) to save the workbook in PDF format. The solution from http://thequickblog.com/convert-an-excel-filexlsx-to-pdf-python/ works for me.
from win32com import client
import win32api
input_file = r'C:\Users\thequickblog\Desktop\Python session 2\tqb_sample.xlsx'
#give your file name with valid path
output_file = r'C:\Users\thequickblog\Desktop\Python session 2\tqb_sample_output.pdf'
#give valid output file name and path
app = client.DispatchEx("Excel.Application")
app.Interactive = False
app.Visible = False
Workbook = app.Workbooks.Open(input_file)
try:
    Workbook.ActiveSheet.ExportAsFixedFormat(0, output_file)
except Exception as e:
    print("Failed to convert to PDF format. Please confirm the environment meets all the requirements and try again.")
    print(str(e))
finally:
    Workbook.Close()
    app.Quit()  # close Excel; the Application object exposes Quit()