SharePoint Excel URL to Python Pandas DataFrame -Streamlit - python

Issue: uploading large file to Streamlit-> need a workaround for file size related issues.
Is there a way to create a pandas DataFrame directly from a SharePoint file URL?
I solved it for Google Drive url link but cannot figure out SharePoint.
Potential Solution: Create a url link from SharePoint and load the excel/csv file in as a pandas df.
import pandas as pd
# Shareable Google Drive link; the file id is taken from its
# second-to-last '/'-separated segment.
url = 'google drive url'
file_id = url.split('/')[-2]
# Rewrite the link into the direct-download form so pandas can fetch it.
path = 'https://drive.google.com/uc?export=download&id=' + file_id
df = pd.read_csv(path)

yea you can use https://github.com/vgrem/Office365-REST-Python-Client
# Stage the download in a fresh temporary directory, keeping the remote
# file's base name.
temp_dir = tempfile.mkdtemp()
download_path = os.path.join(temp_dir, os.path.basename(FILE_URL))
with open(download_path, "wb") as local_file:
    remote_file = ctx.web.get_file_by_server_relative_url(FILE_URL)
    remote_file.download(local_file).execute_query()
then read the download_path
# The downloaded file lives at download_path (the name defined above;
# "downloadpath" without the underscore is undefined).
df = pd.read_csv(download_path)
Don't forget to delete the temp file afterwards!
The library is amazing; you can also read the SharePoint file directly as bytes.
Ex:
def read_csv(ctx, relative_url, pandas=False):
    """Fetch a text file from SharePoint and return its decoded contents.

    The file is downloaded with ``File.open_binary`` and decoded as UTF-8,
    falling back to cp1252 when the bytes are not valid UTF-8.

    Args:
        ctx: Client context, passed straight through to
            ``File.open_binary``.
        relative_url: Location of the file, e.g.
            ``"/sites/myLib/Folder/test.csv"``.
        pandas: When true, wrap the text in a ``StringIO`` so it can be
            handed directly to ``pd.read_csv``; otherwise return the ``str``.

    Returns:
        The decoded text, or a ``StringIO`` over it when ``pandas`` is true.

    Raises:
        UnicodeDecodeError: If the content decodes in neither encoding.
    """
    from io import StringIO  # local import keeps the snippet self-contained

    response = File.open_binary(ctx, relative_url)
    bytes_data = response.content

    try:
        s = bytes_data.decode('utf8')
    except UnicodeDecodeError as e:
        print('utf8 encoding error')
        print(relative_url, e)
        try:
            s = bytes_data.decode('cp1252')
        except UnicodeDecodeError as e:
            print('CRITIAL ERROR cp1252 encoding error')
            print(relative_url, e)
            # Without this, execution would continue with ``s`` unbound and
            # fail later with a confusing UnboundLocalError.
            raise

    return StringIO(s) if pandas else s
I use the pandas flag because my final code looks like this:
# Read the master QRD database: every column as text, and without turning
# the default NA markers into NaN.
csv_buffer = read_csv(ctx=ctx, relative_url=FILE_URL, pandas=True)
df = pd.read_csv(csv_buffer, dtype=str, keep_default_na=False)

Related

How to extract application/zip from api response?

I have got an application/octet-stream response with an application/zip body in a requests.Response object returned from an API call, with a CSV file inside it. I am trying to read the CSV file into pandas without writing to disk, if possible.
And if I want to write the zip file to a path as a zip file, how can I do that?
import io
import zipfile

resp = requests.get(url, headers=headers)
resp.raise_for_status()
# The body is a ZIP archive (a container format), not a bare zlib/gzip
# stream, so zlib.decompress cannot unpack it. Open it in memory with
# zipfile and pull out the first member instead.
with zipfile.ZipFile(io.BytesIO(resp.content)) as archive:
    csv_obj = archive.read(archive.namelist()[0])
print(type(csv_obj))
# read_csv expects a path or a file-like object, not raw bytes.
export_file = pd.read_csv(io.BytesIO(csv_obj))
export_file.to_csv('./Test_export.csv')
Updated version
# step 1: it turns out pandas can read zipped csv files even from urls!
# NOTE(review): this relies on pandas inferring the compression from the
# URL's extension -- confirm the URL actually ends in .zip.
some_dataframe = pandas.read_csv(url)
If pandas can't figure it out by itself there are some parameters you can try to massage.
# If the compression is not detected automatically, state it explicitly;
# header=0 uses the first row as the column names.
some_dataframe = pandas.read_csv(zip_filename, compression='zip', header=0) # etc..
Previous version
I will leave the previous version of my answer below for reference.
# step 1: stream the response body to a local zip file, chunk by chunk
zip_filename = 'response.zip'
with open(zip_filename, 'wb') as archive_out:
    for piece in response.iter_content(chunk_size=255):
        if not piece:
            # skip empty chunks
            continue
        archive_out.write(piece)
# step 2: pandas reads the zipped csv straight from that file
some_dataframe = pandas.read_csv(zip_filename)
import pandas as pd
import io
import zipfile
resp = requests.get(url, headers=headers, stream=True)
resp.raise_for_status()
# Wrap the downloaded bytes so zipfile can treat them as an archive on disk.
zfile = zipfile.ZipFile(io.BytesIO(resp.content))
# The archive held only one file, so the last name in the listing is the CSV.
csv_member = zfile.namelist()[-1]
export_file = pd.read_csv(zfile.open(csv_member))

Python - how to read Sharepoint excel sheet specific worksheet

In Python I am utilizing Office 365 REST Python Client library to access and read an excel workbook that contains many sheets.
While the authentication is successful, I am unable to append the right path of sheet name to the file name in order to access the 1st or 2nd worksheet by its name, which is why the output from the sheet is not JSON, rather IO Bytes which my code is not able to process.
My end goal is to simply access the specific work sheet by its name 'employee_list' and transform it into JSON or Pandas Data frame for further usage.
Code snippet below -
import io
import json
import pandas as pd
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.runtime.auth.user_credential import UserCredential
from office365.runtime.http.request_options import RequestOptions
from office365.sharepoint.client_context import ClientContext
from office365.sharepoint.files.file import File
from io import BytesIO
username = 'abc#a.com'
password = 'abcd'
# Open question: how to address a worksheet by its name in this URL.
site_url = 'https://sample.sharepoint.com/sites/SAMPLE/_layouts/15/Doc.aspx?OR=teams&action=edit&sourcedoc={739271873}'
credentials = UserCredential(username, password)
ctx = ClientContext(site_url).with_credentials(credentials)
api_url = site_url + "/_api/web/"
request = RequestOptions(api_url)
response = ctx.execute_request_direct(request)
# This is the line that fails with a JSON decode error: the body is bytes.
json_data = json.loads(response.content)
You can access it by sheet index, check the following code....
import xlrd
# Placeholder for the path of the workbook on disk.
loc = "File location"
wb = xlrd.open_workbook(loc)
# Pick the sheet by position; index 0 is the first sheet.
sheet = wb.sheet_by_index(0)
# Print the value of the cell at row index 1, column index 0.
print(sheet.cell_value(1, 0))
You can try to add the component 'sheetname' to the url like so.
https://site/lib/workbook.xlsx#'Sheet1'!A1
It seems that URL constructed to access data is not correct. You should test full URL in your browser as working and then modify code to get going. You may try this with some changes, I have verified that URL formed with this logic would return JSON data.
import io
import json
import pandas as pd
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.runtime.auth.user_credential import UserCredential
from office365.runtime.http.request_options import RequestOptions
from office365.sharepoint.client_context import ClientContext
from office365.sharepoint.files.file import File
from io import BytesIO
username = 'abc#a.com'
password = 'abcd'
# Excel Services REST URL for a cell range on the 'employee_list' sheet.
# The literal is double-quoted because the Ranges('...') argument itself
# contains single quotes; single-quoting the whole URL is a SyntaxError.
# Replace RootFolder/ExcelFileName.xlsx with actual path of excel file from the root.
# Replace A1 and A10 with actual start and end of cell range.
site_url = ("https://sample.sharepoint.com/_vti_bin/ExcelRest.aspx"
            "/RootFolder/ExcelFileName.xlsx"
            "/Model/Ranges('employee_list!A1%7CA10')?$format=json")
ctx = ClientContext(site_url).with_credentials(UserCredential(username, password))
request = RequestOptions(site_url)
response = ctx.execute_request_direct(request)
json_data = json.loads(response.content)
Source: https://learn.microsoft.com/en-us/sharepoint/dev/general-development/sample-uri-for-excel-services-rest-api
The update I'm using (Office365-REST-Python-Client==2.3.11) allows simpler access to an Excel file in the SharePoint repository.
# from original_question import pd,\
# username,\
# password,\
# UserCredential,\
# File,\
# BytesIO
# Credentials used to authorise the download.
user_credentials = UserCredential(user_name=username, password=password)

# Absolute URL of the Excel file on SharePoint.
file_url = ('https://sample.sharepoint.com'
            '/sites/SAMPLE/{*recursive_folders}'
            '/sample_worksheet.xlsx')

# In-memory binary buffer that will receive the workbook.
excel_file = BytesIO()

# Reference the remote file by URL and attach the credentials to it.
excel_file_online = File.from_url(abs_url=file_url).with_credentials(
    credentials=user_credentials)

# Run the download, writing the binary response into the buffer.
excel_file_online.download(file_object=excel_file).execute_query()
We now have a binary copy of the Excel file as BytesIO named excel_file. Progressing, reading it as pd.DataFrame is straight-forward like usual Excel file stored in local drive. Eg.:
# No sheet_name is given here, so pandas falls back to its default sheet.
pd.read_excel(excel_file) # -> pd.DataFrame
Hence, if you are interested in a specific sheet like 'employee_list', you may preferably read it as
# Load only the 'employee_list' sheet; the result is a pd.DataFrame.
employee_list = pd.read_excel(excel_file, sheet_name='employee_list')
or
# sheet_name=None loads every sheet into a dict keyed by sheet name.
data = pd.read_excel(excel_file, sheet_name=None)
# .get() yields the DataFrame, or None when no such sheet exists.
employee_list = data.get('employee_list')
I know you stated you can't use a BytesIO object, but for those coming here who are reading the file in as a BytesIO object like I was looking for, you can use the sheet_name arg in pd.read_excel:
from urllib.parse import urlparse

url = "https://sharepoint.site.com/sites/MySite/MySheet.xlsx"
sheet_name = 'Sheet X'
# File.open_binary is called with a server-relative path (e.g.
# "/sites/MySite/MySheet.xlsx"), so strip the scheme and host from the URL.
relative_url = urlparse(url).path
response = File.open_binary(ctx, relative_url)
# Constructing BytesIO from the content leaves the position at 0, so no
# separate write()/seek(0) is needed.
bytes_file_obj = io.BytesIO(response.content)
df = pd.read_excel(bytes_file_obj, sheet_name=sheet_name)  # select the sheet by name

Azure Databricks - Reading Parquet files into DataFrames

I am a newbie with Python and am trying to read Parquet files from Databricks, but when the file is empty an error is thrown. How can I check the file size before reading it into a DataFrame? Code below:
%python
##check if file is empty ???
##if not empty read
##else do something else
try:
    # The assignment and its right-hand side must form one statement; the
    # date segment of the path is written without the stray space.
    parquetDF = spark.read.parquet(
        "wasbs://XXXXX#XXXX.blob.core.windows.net/XXXX/2019-10-11/account.parquet")
except Exception:
    # ``except Exception`` instead of a bare ``except:`` so that
    # KeyboardInterrupt/SystemExit are not swallowed as "empty file".
    print('File is Empty !!!')
For now I am handling this as shown below:
%python
import pandas as pd
# Start from a one-column placeholder frame; it is only replaced when the
# parquet read below succeeds.
data = {'Dummy': ['Dummy']}
parquetDF = pd.DataFrame(data)
try:
    parquetDF = spark.read.parquet("wasbs://XXXXX#XXXXX.blob.core.windows.net/XXXXX/2019-10-11/account.parquet")
except:
    print('Empty File!!!')

# If the first column is no longer the placeholder, real data was loaded.
if parquetDF.columns[0] != 'Dummy':
    print('Do Something !!!')
else:
    print('Do Nothing !!!!')
Creating Dummy DataFrame, then trying to load the DataFrame with parquet Data. If any exceptions / source file is empty DF will not be loaded. Then check if the DF is loaded or not and process accordingly.
Also tried to read filesize, but getting exception 'No such file or directory'
%python
import os
# NOTE(review): os.stat works on local filesystem paths; a wasbs:// URI is
# presumably not one, which would match the 'No such file or directory'
# error reported for this call -- confirm.
statinfo = os.stat("wasbs://XXXXX#XXXXX.blob.core.windows.net/XXXXX/2019-10-11/account.parquet")
# Bare expression, presumably there to display the result in the notebook.
statinfo

Format file into an Excel sheet

I am a new programmer in python, and I need your help. If I load the following url in chrome https://api.mysportsfeeds.com/v1.1/pull/nhl/2016-2017-regular/cumulative_player_stats.{format}, where {format} could be csv or json format, then once downloaded and open, the file is already well formatted. How could I convert it into an Excel sheet in using pandas or openpyxl?
UPDATE
import base64
import requests
import json
USERNAME, PASSWORD = 'notworking', 'notworking'
def send_request():
    """GET the cumulative player stats CSV and return the HTTP response.

    Authenticates with HTTP Basic using the module-level ``USERNAME`` and
    ``PASSWORD``, prints the status code and body of the reply, and returns
    the ``requests.Response``.

    Returns:
        The ``requests.Response`` of the GET request.

    Raises:
        requests.exceptions.RequestException: If the request itself fails.
            'HTTP Request failed' is printed first, then the error is
            re-raised.
    """
    credentials = '{}:{}'.format(USERNAME, PASSWORD).encode('utf-8')
    token = base64.b64encode(credentials).decode('ascii')
    try:
        response = requests.get(
            url="https://api.mysportsfeeds.com/v1.1/pull/nhl/2017-2018-regular/cumulative_player_stats.csv",
            params={"fordate": "20171009"},
            headers={"Authorization": "Basic " + token},
        )
    except requests.exceptions.RequestException:
        print('HTTP Request failed')
        # Falling through to ``return response`` here would raise
        # UnboundLocalError and hide the real cause, so re-raise instead.
        raise

    print('Response HTTP Status Code: {status_code}'.format(
        status_code=response.status_code))
    print('Response HTTP Response Body: {content}'.format(
        content=response.content))
    return response
import pandas as pd
import io
# Fetch the CSV payload and parse its decoded text into a DataFrame.
test = send_request().content
csv_text = test.decode('utf-8')
df = pd.read_csv(io.StringIO(csv_text))
# Hand the frame to an Excel writer under the sheet name 'Sheet1'.
writer = pd.ExcelWriter('/home/jeremie/Projects/Work_Projects/NHL_project/output.xls')
df.to_excel(writer, 'Sheet1')
I am struggling with the fact that my code seems to work, but no file has been created.
You need to close the Pandas Excel writer with a final .save() so that the Excel file is actually written out (newer pandas versions use .close() or a `with` block instead of .save()), e.g.
import pandas as pd
import io
import os
# Set the working folder to the same folder as the script
os.chdir(os.path.dirname(os.path.abspath(__file__)))
test = send_request().content
df = pd.read_csv(io.StringIO(test.decode('utf-8')))
# The workbook is only written to disk when the writer is closed. The
# ``with`` block closes it on exit and replaces the explicit
# writer.save(), which pandas deprecated in 1.5 and removed in 2.0.
# The target is .xlsx because pandas 2.0 also dropped the engine that wrote
# legacy .xls files.
with pd.ExcelWriter('output.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')
By setting the working folder this way, it should work equally well on a Windows PC.

.xlsx and xls(Latest Versions) to pdf using python

With the help of this .doc to pdf using python
Link I am trying for excel (.xlsx and xls formats)
Following is modified Code for Excel:
import os
from win32com import client
# Python 2 script (print statements): walks `folder` and asks Excel, through
# COM automation, to save each workbook into the PDF_excel sub-folder.
folder = "C:\\Oprance\\Excel\\XlsxWriter-0.5.1"
file_type = 'xlsx'
out_folder = folder + "\\PDF_excel"
os.chdir(folder)
# Create the output folder if it does not exist yet.
if not os.path.exists(out_folder):
print 'Creating output folder...'
os.makedirs(out_folder)
print out_folder, 'created.'
else:
print out_folder, 'already exists.\n'
# First pass: just list the .xlsx files found in the folder.
for files in os.listdir("."):
if files.endswith(".xlsx"):
print files
print '\n\n'
# Despite the name `word`, this is an Excel application object.
word = client.DispatchEx("Excel.Application")
# Second pass: open each workbook and save it under the output folder.
# NOTE(review): os.listdir also returns names such as '~$apms_trial.xlsx'
# (see the traceback quoted below), which Excel then fails to open --
# presumably these should be filtered out.
for files in os.listdir("."):
if files.endswith(".xlsx") or files.endswith('xls'):
# NOTE(review): only the text 'xlsx' is replaced, so a name ending in
# '.xls' keeps its original extension -- confirm this is intended.
out_name = files.replace(file_type, r"pdf")
in_file = os.path.abspath(folder + "\\" + files)
out_file = os.path.abspath(out_folder + "\\" + out_name)
doc = word.Workbooks.Open(in_file)
print 'Exporting', out_file
# NOTE(review): FileFormat=56 is presumably not a PDF format; the answers
# below suggest 57 or ExportAsFixedFormat instead -- verify.
doc.SaveAs(out_file, FileFormat=56)
doc.Close()
It is showing following error :
>>> execfile('excel_to_pdf.py')
Creating output folder...
C:\Excel\XlsxWriter-0.5.1\PDF_excel created.
apms_trial.xlsx
~$apms_trial.xlsx
Exporting C:\Excel\XlsxWriter-0.5.1\PDF_excel\apms_trial.pdf
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "excel_to_pdf.py", line 30, in <module>
doc = word.Workbooks.Open(in_file)
File "<COMObject <unknown>>", line 8, in Open
pywintypes.com_error: (-2147352567, 'Exception occurred.', (0, u'Microsoft Excel
', u"Excel cannot open the file '~$apms_trial.xlsx' because the file format or f
ile extension is not valid. Verify that the file has not been corrupted and that
the file extension matches the format of the file.", u'xlmain11.chm', 0, -21468
27284), None)
>>>
There is problem in
doc.SaveAs(out_file, FileFormat=56)
What should be FileFormat file format?
Please Help
Link of xlsxwriter :
https://xlsxwriter.readthedocs.org/en/latest/contents.html
With the help of this you can generate excel file with .xlsx and .xls
for example excel file generated name is trial.xls
Now if you want to generate pdf of that excel file then do the following :
from win32com import client
# Open the generated workbook in Excel through COM.
source_path = 'C:\\excel\\trial.xls'
target_path = 'C:\\excel\\trial.pdf'
xlApp = client.Dispatch("Excel.Application")
books = xlApp.Workbooks.Open(source_path)
# Take the worksheet at index 0 and make it visible before exporting.
ws = books.Worksheets[0]
ws.Visible = 1
# The first argument, 0, is the export type (presumably PDF -- verify).
ws.ExportAsFixedFormat(0, target_path)
I hit the same problem and the same error. The answer is FileFormat=57; see below:
from win32com import client
import win32api
def exceltopdf(doc, out_path='c:\\targetfolder\\result.pdf'):
    """Save the workbook at *doc* as a PDF using Excel COM automation.

    Args:
        doc: Path of the workbook to open.
        out_path: Where the PDF is written. Defaults to the location the
            original snippet hard-coded.

    A failure while saving is reported on stdout rather than raised. The
    workbook is always closed and the Excel instance always quit, including
    when opening the workbook fails.
    """
    excel = client.DispatchEx("Excel.Application")
    excel.Visible = 0
    try:
        wb = excel.Workbooks.Open(doc)
        try:
            # 57 is the FileFormat value this answer reports as producing a
            # PDF.
            wb.SaveAs(out_path, FileFormat=57)
        except Exception as e:
            print("Failed to convert")
            print(str(e))
        finally:
            wb.Close()
    finally:
        # Quit even when Open() raised, so no hidden Excel process is left
        # running.
        excel.Quit()
... as an alternative to the fragile ExportAsFixedFormat...
You can also print an Excel sheet to PDF on Linux using Python.
You need to run OpenOffice as a headless server and use unoconv; it takes a bit of configuring but is doable.
You run OpenOffice as a (service) daemon and use it for the conversion of xls, xlsx, doc and docx files.
http://dag.wiee.rs/home-made/unoconv/
Another solution
is to start a Gotenberg Docker container locally:
https://github.com/gotenberg/gotenberg
Then pass a file (in any format supported by LibreOffice) from Python via HTTP to the container and get the result back as a PDF.
LIBREOFFICE_URL = 'http://localhost:3000/forms/libreoffice/convert'
LIBREOFFICE_LANDSCAPE_URL = 'http://localhost:3000/forms/libreoffice/convert?landscape=1'


def _retry_gotenberg(url, io_bytes, post_file_name='index.html'):
    """POST *io_bytes* to a Gotenberg endpoint, retrying on non-200 replies.

    Args:
        url: Conversion endpoint to post to.
        io_bytes: File content (bytes or a file-like object) to upload.
        post_file_name: Form field / file name used for the upload.

    Returns:
        The first response whose status code is 200.

    Raises:
        RuntimeError: If none of the five attempts returns status 200.
    """
    attempts = 5
    # Remember where a seekable stream starts so each retry resends the same
    # bytes instead of an already-consumed stream.
    rewind_to = io_bytes.tell() if hasattr(io_bytes, 'seek') else None

    response = None
    for attempt in range(attempts):
        if attempt and rewind_to is not None:
            io_bytes.seek(rewind_to)
        response = requests.post(url, files={post_file_name: io_bytes})
        if response.status_code == 200:
            return response
        logging.info('Will sleep and retry: %s %s', response.status_code, response.content)
        # No point in waiting once the final attempt has failed.
        if attempt < attempts - 1:
            sleep(3)

    raise RuntimeError(f'Bad response from doc-to-pdf: {response.status_code} {response.content}')
def process_libreoffice(io_bytes, ext: str):
    """Convert a document to PDF through the local Gotenberg service.

    Args:
        io_bytes: Content of the file to convert.
        ext: File extension, with or without the leading dot
            (``'.docx'`` or ``'docx'``).

    Returns:
        The body of the successful conversion response.

    Word documents go to the default endpoint; every other extension is
    sent to the landscape endpoint.
    """
    # Normalise to a single leading dot so the membership test and the
    # upload name agree (the old code produced names like 'file..doc').
    if not ext.startswith('.'):
        ext = '.' + ext

    if ext in ('.doc', '.docx'):
        url = LIBREOFFICE_URL
    else:
        url = LIBREOFFICE_LANDSCAPE_URL

    # Plain function call: this is a module-level function, there is no
    # ``self`` in scope.
    response = _retry_gotenberg(url, io_bytes, post_file_name=f'file{ext}')
    return response.content
The GroupDocs.Conversion Cloud SDK for Python is another option to convert Excel to PDF. It is paid API. However, it provides 150 free monthly API calls.
P.S: I'm a developer evangelist at GroupDocs.
# Import module
import groupdocs_conversion_cloud
from shutil import copyfile
# Get your client_id and client_key at https://dashboard.groupdocs.cloud (free registration is required).
client_id = "xxxxxx-xxxx-xxxx-xxxx-xxxxxxxxx"
client_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

# Create instance of the API
convert_api = groupdocs_conversion_cloud.ConvertApi.from_keys(client_id, client_key)

try:
    # Convert the Excel workbook to PDF
    # Prepare request
    request = groupdocs_conversion_cloud.ConvertDocumentDirectRequest("pdf", "C:/Temp/Book1.xlsx")
    # Convert
    result = convert_api.convert_document_direct(request)
    # `result` is passed to copyfile as the source path of the converted file.
    copyfile(result, 'C:/Temp/Book1_output.pdf')
    print("Result {}".format(result))
except groupdocs_conversion_cloud.ApiException as e:
    # Name the call that actually failed, not an unrelated API method.
    print("Exception when calling convert_document_direct: {0}".format(e.message))

Categories

Resources