I am using the pandas library to combine Excel sheets from different files in an in-memory BytesIO buffer and write them to a single combined Excel file, as follows:
# S3 destination of the combined workbook, plus a module-level in-memory buffer.
bucket = "MYBUCKET"
filepath = "test.xlsx"
output = io.BytesIO()

# Working directory with every "/src" occurrence stripped out of the path.
current_dir = os.getcwd()
root_path = current_dir.replace("/src", "")

# Root logger: INFO and above, echoed to stdout with a timestamped format.
formatter = logging.Formatter("%(asctime)s -> [%(levelname)s]: %(message)s")
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(handler)
def get_sheet_names(path):
    """Return the names of all sheets in the workbook at *path*.

    Args:
        path: Anything ``pd.ExcelFile`` accepts (local path, URL, buffer).

    Returns:
        The list of sheet names, in workbook order.
    """
    # The context manager closes the underlying file handle; the original
    # left the ExcelFile open until garbage collection.
    with pd.ExcelFile(path) as xls:
        return xls.sheet_names
def add_sheets(metadata_path, sample_path):
    """Copy every sheet of two workbooks into one workbook and upload it to S3.

    The combined workbook is built in memory, saved as a local copy under
    ``root_path`` and uploaded to ``bucket`` under the key ``filepath``.

    Args:
        metadata_path: Location of the metadata workbook; its sheets are
            written first.
        sample_path: Location of the sample-data workbook; its sheets are
            written after the metadata sheets. Its last "/"-separated
            component is used to name the local copy.
    """
    filename = sample_path.split("/")[-1]
    filename_slugify = f"_metadata_{filename.lower()}_{date.today()}.xlsx"
    logger.info(f"root path: {root_path}")
    output_path = f"{root_path}/{filename_slugify}"

    # Write into a buffer created for this call. The original handed the
    # writer a file path and then uploaded a different, never-written buffer,
    # which is why the uploaded object was empty.
    buffer = io.BytesIO()
    sources = (("Metadata", metadata_path), ("Sample data", sample_path))
    # The context manager finalises the workbook on exit (ExcelWriter.save()
    # no longer exists in pandas 2.x).
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
        for label, source_path in sources:
            # Open each source once instead of re-reading it for every sheet.
            with pd.ExcelFile(source_path) as workbook:
                sheet_names = workbook.sheet_names
                logger.info(f"{label} sheets: {sheet_names}")
                for i, sheet in enumerate(sheet_names):
                    logger.info(f"Sheet number '{i+1}' is in progress: {sheet}")
                    # header=None keeps the first row as data so the sheet is
                    # copied verbatim.
                    df = workbook.parse(sheet_name=i, header=None)
                    df.to_excel(writer, sheet_name=sheet, index=False, header=False)
        logger.info(f'{output_path} \n Exporting {filename_slugify}...')

    # The buffer only holds the complete workbook once the writer has closed.
    data = buffer.getvalue()

    # Keep producing the local copy the original wrote to output_path.
    with open(output_path, "wb") as local_copy:
        local_copy.write(data)
    logger.info(f"Adding level value for '{filename}' has finished.")

    s3 = boto3.resource('s3')
    s3.Bucket(bucket).put_object(Key=filepath, Body=data)
if __name__ == "__main__":
    # Placeholder locations; the pieces are concatenated as-is, so no "/"
    # is inserted between the bucket name and the rest of the path.
    metadata_source = "s3://" + "MYBUCKET" + f"METADATAFILE_PATH"
    sample_source = "s3://" + "MYBUCKET" + f"SAMPLEDATAFILE_PATH"
    add_sheets(metadata_source, sample_source)
The output file comes out empty, with no data or sheets. When debugging the dataframes, I can see that the read operations work fine, so it is the writer that is missing something.
Related
I am trying to save a pandas dataframe as an Excel table to a SharePoint site. I have two separate blocks of code which achieve the below (thanks to the Stack Overflow community).
A script which can save a pandas df as excel table using ExcelWriter on local storage.
A Script which can save a local file to Sharepoint online.
I am very confused about how to combine these two to save a df to SharePoint Online, where the Excel file should contain a table, not just a range of data. Kindly help.
SCRIPT 1 to save excel range as table on local
##############################################################################
#
# An example of adding a dataframe to a worksheet table in an xlsx file
# using Pandas and XlsxWriter.
#
# Tables in Excel are used to group rows and columns of data into a single
# structure that can be referenced in a formula or formatted collectively.
#
# SPDX-License-Identifier: BSD-2-Clause
# Copyright 2013-2021, John McNamara, jmcnamara#cpan.org
#
import pandas as pd
# Create a Pandas dataframe from some data.
df = pd.DataFrame({
    'Country': ['China', 'India', 'United States', 'Indonesia'],
    'Population': [1404338840, 1366938189, 330267887, 269603400],
    'Rank': [1, 2, 3, 4]})

# Order the columns if necessary.
df = df[['Rank', 'Country', 'Population']]

# Create a Pandas Excel writer using XlsxWriter as the engine. The context
# manager closes the writer and outputs the Excel file on exit; it replaces
# the trailing writer.save() call, which no longer exists in pandas 2.x.
with pd.ExcelWriter('pandas_table.xlsx', engine='xlsxwriter') as writer:
    # Write the dataframe data to XlsxWriter. Turn off the default header and
    # index and skip one row to allow us to insert a user defined header.
    df.to_excel(writer, sheet_name='Sheet1', startrow=1, header=False, index=False)

    # Get the xlsxwriter workbook and worksheet objects.
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    # Get the dimensions of the dataframe.
    (max_row, max_col) = df.shape

    # Create a list of column headers, to use in add_table().
    column_settings = [{'header': column} for column in df.columns]

    # Add the Excel table structure. Pandas will add the data.
    worksheet.add_table(0, 0, max_row, max_col - 1, {'columns': column_settings})

    # Make the columns wider for clarity.
    worksheet.set_column(0, max_col - 1, 12)
SCRIPT 2 to save any file to sharepoint
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
import os
baseurl = 'https://testsite.sharepoint.com/'
basesite = '/sites/project' # every share point has a home.
siteurl = baseurl + basesite
localpath = "pandas_table.xlsx"
remotepath = "Shared Documents/General/file.xlsx" # existing folder path under sharepoint site.
ctx_auth = AuthenticationContext(siteurl)
ctx_auth.acquire_token_for_user(<username>, <password>)
ctx = ClientContext(siteurl, ctx_auth) # make sure you auth to the siteurl.
with open(localpath, 'rb') as content_file:
file_content = content_file.read()
dir, name = os.path.split(remotepath)
file = ctx.web.get_folder_by_server_relative_url(dir).upload_file(name, file_content).execute_query()
I am not really sure how I can use the writer.save() with the sharepoint connector. Kindly advise
thanks in advance.
Given that you have your data in a df the following code will write to sharepoint using the O365 library.
from io import BytesIO
from tempfile import gettempdir
from O365 import Account, FileSystemTokenBackend
import pandas as pd
# App credentials and tenant used to authenticate against O365 (placeholders).
O365_TENANT_ID = "<name>"
O365_CLIENT_ID = "client"
O365_SECRET = "secret"

# SharePoint host and the site path the document library is taken from.
O365_SHAREPOINT = "<name>.sharepoint.com"
O365_SITE = "/sites/..."
def save_file(folder_path, filename, data):
    """Upload an in-memory file to the site's default document library.

    Args:
        folder_path: "/"-separated path of the target folder inside the
            document library. An empty path (or only slashes) targets the
            library's root folder.
        filename: Name to give the uploaded file.
        data: Binary stream to upload; it must provide ``getbuffer()``
            (for example ``io.BytesIO``), which is used to obtain its size.

    Raises:
        FileNotFoundError: If a folder named in ``folder_path`` is missing.

    Note:
        When authentication fails nothing is uploaded and the function
        returns without raising, as before.
    """
    account = Account(
        (O365_CLIENT_ID, O365_SECRET),
        auth_flow_type="credentials",
        tenant_id=O365_TENANT_ID,
        token_backend=FileSystemTokenBackend(
            token_path=gettempdir(), token_filename="o365_token.txt"
        ),
    )
    if not account.authenticate():
        return

    drive = (
        account.sharepoint()
        .get_site(O365_SHAREPOINT, O365_SITE)
        .get_default_document_library()
    )

    # str.split() never returns an empty list, so the original emptiness
    # check was always true and the root-folder branch was unreachable.
    # Dropping empty segments makes "" (and stray slashes) mean "root".
    subfolders = [part for part in folder_path.split("/") if part]
    if subfolders:
        items = drive.get_items()
        for subfolder in subfolders:
            # Match the folder name exactly. The original substring test
            # would also accept e.g. "reports2" when asked for "reports".
            subfolder_drive = next(
                (item for item in items if item.name == subfolder), None
            )
            if subfolder_drive is None:
                # Raising a plain str is itself a TypeError in Python 3;
                # raise a real exception carrying the same message.
                raise FileNotFoundError(f"Path {folder_path} does not exist.")
            items = subfolder_drive.get_items()
    else:
        subfolder_drive = drive.get_root_folder()

    subfolder_drive.upload_file(
        item=None,
        item_name=filename,
        stream=data,
        stream_size=data.getbuffer().nbytes,
    )
# Serialise the dataframe to an in-memory workbook and upload it.
with BytesIO() as workbook_stream:
    df.to_excel(workbook_stream, index=False)
    # Rewind so the upload reads the workbook from its first byte.
    workbook_stream.seek(0)
    save_file('folder/to/upload/to', 'filename.xlsx', workbook_stream)
Writing to multiple sheets:
# Write the same dataframe to two sheets of an in-memory workbook, then upload.
with BytesIO() as workbook_stream:
    with pd.ExcelWriter(workbook_stream, engine="xlsxwriter") as writer:  # pylint: disable=abstract-class-instantiated
        for sheet_title in ('sheet1', 'sheet2'):
            df.to_excel(writer, sheet_name=sheet_title, index=False)
    # The writer must be closed before rewinding: the workbook bytes are
    # only written to the stream when the writer's context exits.
    workbook_stream.seek(0)
    save_file('folder/to/upload/to', 'filename.xlsx', workbook_stream)
How can I append a row at the top of an excel sheet? Goal as follows:
The file itself is written by using pandas.df.to_excel as follows:
import pandas
# One worksheet per CSV file, titled after the file name.
with pandas.ExcelWriter(output_filename) as writer:
    for csv_path in files:
        # "some_file.csv" -> "Some File"
        sheet_title = csv_path.replace(".csv", "").replace("_", " ").title()
        pandas.read_csv(csv_path).to_excel(writer, sheet_name=sheet_title, index=False)
Here is one way to do it using XlsxWriter as the Excel engine:
# One worksheet per CSV file, with a free-text line above each table.
with pandas.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
    for csv_path in files:
        frame = pandas.read_csv(csv_path)
        title = csv_path.replace(".csv", "").replace("_", " ").title()
        # startrow=1 shifts the table down one row, leaving row 1 free.
        frame.to_excel(writer, sheet_name=title, index=False, startrow=1)
        writer.sheets[title].write('A1', 'Here is some additional text')
You can use openpyxl to edit your Excel file afterwards:
import contextlib
import openpyxl
import pandas as pd
# Text placed in the new first row of every sheet.
new_row = "THIS ROW IS APPENDED AFTER THE FILE IS WRITTEN BY PANDAS"

# Re-open the workbook pandas wrote, push every sheet down by one row and
# fill in cell A1, then save the workbook back to the same file.
with contextlib.closing(openpyxl.open(output_filename)) as wb:
    for file in files:
        sheet_name = file.replace(".csv", "").replace("_", " ").title()
        sheet = wb[sheet_name]
        # openpyxl rows are 1-based: inserting before row 1 adds a row at the
        # very top. The original passed 0, which is not a valid row index.
        sheet.insert_rows(1)
        sheet["A1"] = new_row
    wb.save(output_filename)
I am trying to create 3 different dataframes to output in my excel file in 3 separate worksheet called df, df_OK, df_KO. However the code below only outputs df and is not creating the other 2 dataframes df_OK, df_KO to have in the same Excel file but in 2 separate worksheets.
Any suggestions? Thanks
class blah:
    """Split a workbook by its 'Status' column into three worksheets."""

    def __init__(self, path, file_in, file_out):
        """Store the locations used by process_file().

        Args:
            path: Directory prefix; it is concatenated directly with the file
                names, so it must end with a path separator.
            file_in: Name of the workbook to read.
            file_out: Name of the workbook to write.
        """
        self.path = path
        self.file_in = file_in
        self.file_out = file_out

    @staticmethod
    def _build_sheets(df):
        """Return the sheet-name -> dataframe mapping written by process_file()."""
        # Copy before adding the 'Total' row: assigning into a filtered slice
        # of df triggers SettingWithCopyWarning and may not modify the slice.
        df_OK = df.loc[df['Status'] == 'OK'].copy()
        df_KO = df.loc[df['Status'] == 'KO']
        # Adds a row labelled 'Total' holding the sum of 'Price'; the other
        # columns of that row are left empty.
        df_OK.loc['Total'] = df_OK[['Price']].sum(axis=0)
        return {
            'All': df,
            'OK': df_OK,
            'KO': df_KO
        }

    def process_file(self):
        """Read the input workbook and write the 'All', 'OK' and 'KO' sheets."""
        df = pd.read_excel(self.path + self.file_in)
        # Write every sheet through a single writer and finalise the workbook
        # once, when the context exits. This replaces writer.save(), which no
        # longer exists in pandas 2.x.
        with pd.ExcelWriter(self.path + self.file_out, engine='xlsxwriter') as writer:
            for sheet_name, frame in self._build_sheets(df).items():
                frame.to_excel(writer, sheet_name=sheet_name, index=False)
# blah() takes exactly (path, file_in, file_out). The original call passed a
# fourth positional value ('path...'), which raises TypeError.
b = blah('C:/Users/......./',
         'file_in....',
         'file_out...')
b.process_file()
It is because you overwrite the same Excel file in every iteration of your for sheet_name in dfs.keys() loop. So every time you write an Excel file with only a single sheet to the same filename, thus overwriting the previous document.
You should move the writer.save() outside your loop like so:
# Add every dataframe to the workbook first ...
for sheet_name, frame in dfs.items():
    frame.to_excel(writer, sheet_name=sheet_name, index=False)
# ... then finalise the file once, after the loop. close() writes the
# workbook to disk; ExcelWriter.save() no longer exists in pandas 2.x.
writer.close()
I am using Ubuntu 16.04. After reading from an Excel file, I am trying to add multiple Excel sheets to a PDF file.
# Load the sheet, then drop columns and rows that are entirely empty.
df = (
    pd.read_excel(excel_name, sheet_name='Sheet1')
    .dropna(axis=1, how='all')
    .dropna(how='all')
)

# Render through an intermediate HTML file, which pdfkit converts to PDF.
df.to_html("file.html")
pdf_name = name_of_file + '.pdf'
pdfkit.from_file("file.html", pdf_name)
How can I add another excel sheet from the same excel file to the same pdf file without overwriting the previous sheet that is in the pdf?
Thanks!
If the two sheets have the same data structure (columns and etc.):
df1 = pd.read_excel(excel_name, sheet_name='Sheet1')
df2 = pd.read_excel(excel_name, sheet_name='Sheet2')
# DataFrame.append() no longer exists in pandas 2.x. pd.concat() stacks the
# rows of df2 under df1 and keeps the original index labels, as append() did.
df = pd.concat([df1, df2])
If not:
# Load both sheets of the same workbook.
df1, df2 = (
    pd.read_excel(excel_name, sheet_name=sheet_title)
    for sheet_title in ('Sheet1', 'Sheet2')
)
# Do whatever you need to transform the dfs

# Render each dataframe as its own HTML table, separated by a line break.
html_tables = (df1.to_html(), df2.to_html())
html_str = '<br />'.join(html_tables)
with open("file.html", "w") as text_file:
    text_file.write(html_str)

# Convert the combined HTML page into a single PDF.
pdf_name = name_of_file + '.pdf'
pdfkit.from_file("file.html", pdf_name)
The code below creates an Excel file and then fills it in. I would like to create an Excel file filtered on the value of a single column and save it again with that column value as a prefix of the file name.
So return only where columnX = i and create and save excel file i1_CCBHC_MONTHLY_CLAIMS.XLSX
i2_CCBHC_MONTHLY_CLAIMS.XLSX
I have the build of the large "parent" excel file.
# Write df_ora to the 'CCBHC_DATA' sheet of the parent workbook: update the
# workbook through Excel when it already exists, otherwise create it.
filename = 'CCBHC_Monthly_Claims.xlsx'
if os.path.isfile(filename):
    wb = xw.Book(filename)
    ws = wb.sheets['CCBHC_DATA']
    ws.range('A1').options(index=False).value = df_ora
    # Quitting Excel does not save open workbooks, so the new values have to
    # be saved explicitly or they are lost.
    wb.save()
    xw.apps[0].quit()
else:
    # The original never closed the writer, so the file did not exist when
    # xlwings then tried to open it. The context manager writes it to disk
    # on exit. Re-writing the same data through Excel afterwards is redundant
    # and has been dropped. (The stray '*' characters were markup residue.)
    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        df_ora.to_excel(writer, sheet_name='CCBHC_DATA', index=False)