I am using Ubuntu 16.04. After reading from an Excel file, I am trying to add multiple Excel sheets to a PDF file.
df = pd.read_excel(excel_name, sheet_name = 'Sheet1')
df = df.dropna(axis = 1, how='all')
df = df.dropna(how='all')
df.to_html("file.html")
pdf_name = name_of_file + '.pdf'
pdfkit.from_file("file.html", pdf_name)
How can I add another excel sheet from the same excel file to the same pdf file without overwriting the previous sheet that is in the pdf?
Thanks!
If the two sheets have the same data structure (columns, etc.):
df1 = pd.read_excel(excel_name, sheet_name = 'Sheet1')
df2 = pd.read_excel(excel_name, sheet_name = 'Sheet2')
df = df1.append(df2)
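The combined frame can then go through the same to_html / pdfkit steps shown in the question, for example:
df.to_html("file.html")
pdfkit.from_file("file.html", name_of_file + '.pdf')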
If not:
df1 = pd.read_excel(excel_name, sheet_name = 'Sheet1')
df2 = pd.read_excel(excel_name, sheet_name = 'Sheet2')
# Do whatever you need to transform the dfs
html_str = '<br />'.join([df1.to_html(), df2.to_html()])
with open("file.html", "w") as text_file:
text_file.write(html_str)
pdf_name = name_of_file + '.pdf'
pdfkit.from_file("file.html", pdf_name)
I am using the pandas library to combine Excel sheets from different files in BytesIO memory and write them to another, combined Excel file as follows:
import io
import logging
import os
import sys
from datetime import date

import boto3
import pandas as pd

# Bucket
output = io.BytesIO()
bucket = "MYBUCKET"
filepath = "test.xlsx"

# dir
current_dir = os.getcwd()
root_path = current_dir.replace("/src", "")

# logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s -> [%(levelname)s]: %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

def get_sheet_names(path):
    # Returns all sheet names
    xls = pd.ExcelFile(path)
    return xls.sheet_names
def add_sheets(metadata_path, sample_path):
    filename = sample_path.split("/")[-1]
    filename_slugify = f"_metadata_{filename.lower()}_{date.today()}.xlsx"
    # create a Pandas Excel writer using XlsxWriter as the engine
    logger.info(f"root path: {root_path}")
    output_path = f"{root_path}/{filename_slugify}"
    writer = pd.ExcelWriter(output_path, engine='xlsxwriter')
    # Metadata
    metadata_sheets = get_sheet_names(metadata_path)
    logger.info(f"Metadata sheets: {metadata_sheets}")
    for i, sheet in enumerate(metadata_sheets):
        logger.info(f"Sheet number '{i+1}' is in progress: {sheet}")
        df = pd.read_excel(metadata_path, sheet_name=i, header=None)  # Create dataframe from sheet
        # write each DataFrame to a specific sheet
        df.to_excel(writer, sheet_name=sheet, index=False, header=False)
    # Sample data
    sample_sheets = get_sheet_names(sample_path)
    logger.info(f"Sample data sheets: {sample_sheets}")
    for i, sheet in enumerate(sample_sheets):
        logger.info(f"Sheet number '{i+1}' is in progress: {sheet}")
        df = pd.read_excel(sample_path, sheet_name=i, header=None)  # Create dataframe from sheet
        # write each DataFrame to a specific sheet
        df.to_excel(writer, sheet_name=sheet, index=False, header=False)
    logger.info(f'{output_path} \n Exporting {filename_slugify}...')
    writer.save()
    output.seek(0)
    logger.info(f"Adding level value for '{filename}' has finished.")
    data = output.getvalue()
    s3 = boto3.resource('s3')
    s3.Bucket(bucket).put_object(Key=filepath, Body=data)
if __name__ == "__main__":
    add_sheets("s3://" + "MYBUCKET" + f"METADATAFILE_PATH",
               "s3://" + "MYBUCKET" + f"SAMPLEDATAFILE_PATH")
The output file comes out empty, with no data or sheets. When debugging the dataframes I can see the read operations work fine, so it is the writer that is missing something.
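One likely cause, offered as a sketch rather than a definitive fix: the writer above targets output_path on disk, while the S3 upload reads from the output buffer, which nothing ever writes to. Pointing the ExcelWriter at the BytesIO buffer keeps the uploaded bytes in sync (only the metadata loop is shown; variable names reuse the snippet above):
output = io.BytesIO()
with pd.ExcelWriter(output, engine='xlsxwriter') as writer:  # build the workbook in memory
    for i, sheet in enumerate(get_sheet_names(metadata_path)):
        df = pd.read_excel(metadata_path, sheet_name=i, header=None)
        df.to_excel(writer, sheet_name=sheet, index=False, header=False)
output.seek(0)
s3 = boto3.resource('s3')
s3.Bucket(bucket).put_object(Key=filepath, Body=output.getvalue())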
How can I append a row at the top of an Excel sheet, so that a line of text sits above the existing data? The file itself is written using pandas DataFrame.to_excel as follows:
import pandas

with pandas.ExcelWriter(output_filename) as writer:
    for file in files:
        df = pandas.read_csv(file)
        df.to_excel(writer, sheet_name=file.replace(".csv", "").replace("_", " ").title(), index=False)
Here is one way to do it using XlsxWriter as the Excel engine:
with pandas.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
    for file in files:
        df = pandas.read_csv(file)
        sheet_name = file.replace(".csv", "").replace("_", " ").title()
        df.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1)
        worksheet = writer.sheets[sheet_name]
        worksheet.write('A1', 'Here is some additional text')
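If that extra line should stand out, an XlsxWriter format can be passed to the same write call. A small sketch of lines that would sit right after the worksheet.write call inside the loop above (the bold format is only an illustration):
workbook = writer.book  # underlying XlsxWriter workbook object
bold = workbook.add_format({'bold': True})
worksheet.write('A1', 'Here is some additional text', bold)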
You can use openpyxl to edit your Excel file afterwards:
import contextlib
import openpyxl
import pandas as pd

new_row = "THIS ROW IS APPENDED AFTER THE FILE IS WRITTEN BY PANDAS"

with contextlib.closing(openpyxl.open(output_filename)) as wb:
    for file in files:
        sheet_name = file.replace(".csv", "").replace("_", " ").title()
        sheet = wb[sheet_name]
        sheet.insert_rows(1)  # rows are 1-indexed in openpyxl: insert before the first row
        sheet["A1"] = new_row
    wb.save(output_filename)
I am trying to create three dataframes (df, df_OK, df_KO) and output each of them to a separate worksheet in the same Excel file. However, the code below only outputs df; the other two dataframes, df_OK and df_KO, never end up as separate worksheets in that file.
Any suggestions? Thanks
class blah:
    def __init__(self, path, file_in, file_out):
        self.path = path
        self.file_in = file_in
        self.file_out = file_out

    def process_file(self):
        df = pd.read_excel(self.path + self.file_in)
        df_OK = df.loc[df['Status'] == 'OK']
        df_KO = df.loc[df['Status'] == 'KO']
        df_OK.loc['Total'] = df_OK[['Price']].sum(axis=0)
        writer = pd.ExcelWriter(self.path + self.file_out, engine='xlsxwriter')
        dfs = {
            'All': df,
            'OK': df_OK,
            'KO': df_KO
        }
        for sheet_name in dfs.keys():
            dfs[sheet_name].to_excel(writer, sheet_name=sheet_name, index=False)
            writer.save()

b = blah('C:/Users/......./',
         'path...',
         'file_in....',
         'file_out...')
b.process_file()
It is because writer.save() runs inside your for sheet_name in dfs.keys() loop, so you overwrite the same Excel file in every iteration: each pass saves the partially built workbook to the same filename, replacing the previous document.
You should move the writer.save() outside your loop, like so:
for sheet_name in dfs.keys():
    dfs[sheet_name].to_excel(writer, sheet_name=sheet_name, index=False)
writer.save()
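An equivalent sketch of the same fix, letting pandas' ExcelWriter context manager perform the single save after the loop (paths taken from the question's attributes):
with pd.ExcelWriter(self.path + self.file_out, engine='xlsxwriter') as writer:
    for sheet_name, frame in dfs.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)
# the workbook is written once, when the with-block exits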
Say I have a folder folder1 containing Excel files whose filenames share the same structure: city, building name, and id. I want to save these fields in a dataframe and then write them to an Excel file. Note that I also need to append the Excel filenames from other folders to the result.
bj-LG center-101012.xlsx
sh-ABC tower-1010686.xlsx
bj-Jinzhou tower-101018.xlsx
gz-Zijin building-101012.xls
...
The first method I have tried:
import os
import pandas as pd
from pandas import DataFrame, ExcelWriter
path = os.getcwd()
file = [".".join(f.split(".")[:-1]) for f in os.listdir() if os.path.isfile(f)] #exclude files' extension
city = file.split('-')[0]
projectName = file.split('-')[1]
projectID = file.split('-')[2]
#print(city)
df = pd.DataFrame(columns = ['city', 'building name', 'id'])
df['city'] = city
df['building name'] = projectName
df['id'] = projectID
writer = pd.ExcelWriter("C:/Users/User/Desktop/test.xlsx", engine='xlsxwriter')
df.to_excel(writer, index = False)
writer.save()
Problem:
Traceback (most recent call last):
File "<ipython-input-203-c09878296e72>", line 9, in <module>
city = file.split('-')[0]
AttributeError: 'list' object has no attribute 'split'
My second method:
for root, directories, files in os.walk(path):
    #print(root)
    for file in files:
        if file.endswith('.xlsx') or file.endswith('.xls'):
            #print(file)
            city = file.split('-')[0]
            projectName = file.split('-')[1]
            projectID = file.split('-')[2]
            #print(city)
            df = pd.DataFrame(columns = ['city', 'building name', 'id'])
            df['city'] = city
            df['building name'] = projectName
            df['id'] = projectID
            writer = pd.ExcelWriter("C:/Users/User/Desktop/test.xlsx", engine='xlsxwriter')
            df.to_excel(writer, index = False)
            writer.save()
I got an empty test.xlsx file. How can I make it work? Thanks.
This splits off the file extension, then unpacks the split into variables.
It then builds a dictionary for each file and appends the dictionary to the dataframe.
files = [
    "bj-LG center-101012.xlsx",
    "sh-ABC tower-1010686.xlsx",
    "bj-Jinzhou tower-101018.xlsx",
    "gz-Zijin building-101012.xls"]

df = pd.DataFrame()

for file in files:
    filename = file.split(".")[0]
    city, projectName, projectID = filename.split("-")
    d = {'city': city, 'projectID': projectID, 'projectName': projectName}
    df = df.append(d, ignore_index=True)

df.to_excel('summary.xlsx')
Method 2 is close.
You need to create the dataframe before the for loops. After your variable assignments, make a dictionary of the variables and append it to the dataframe.
There is also probably a better way to find your file list using glob, but I will just work with what you have already done.
df = pd.DataFrame()

for root, directories, files in os.walk(path):
    for file in files:
        if file.endswith('.xlsx') or file.endswith('.xls'):
            #print(file)
            city = file.split('-')[0]
            projectName = file.split('-')[1]
            projectID = file.split('-')[2]
            # append data inside the inner loop
            d = {'city': city, 'building name': projectName, 'id': projectID}
            df = df.append(d, ignore_index=True)

writer = pd.ExcelWriter("C:/Users/User/Desktop/test.xlsx", engine='xlsxwriter')
df.to_excel(writer, index=False)
writer.save()
This works, thanks to the hint to use glob from @Dan Wisner:
import os
import pandas as pd
from glob import glob

# '+' collects both extensions; 'or' would drop the .xls files whenever any .xlsx exist
fileNames = [os.path.splitext(val)[0] for val in glob('*.xlsx') + glob('*.xls')]
df = pd.DataFrame({'fileNames': fileNames})
df[['city', 'name', 'id']] = df['fileNames'].str.split('-', n=2, expand=True)
del df['fileNames']

writer = pd.ExcelWriter("C:/Users/User/Desktop/test1.xlsx", engine='xlsxwriter')
df.to_excel(writer, index=False)
writer.save()
Each folder has a CSV for each month of the year (1.csv, 2.csv, 3.csv, etc.), and the script combines the 9th column of all 12 CSVs into one dataframe and writes it to a sheet of an xlsx file named concentrated.xlsx. It works, but only for one directory at a time:
files = glob('2014/*.csv')
sorted_files = natsorted(files)

def read_9th(fn):
    return pd.read_csv(fn, usecols=[9], names=headers)

big_df = pd.concat([read_9th(fn) for fn in sorted_files], axis=1)

writer = pd.ExcelWriter('concentrated.xlsx', engine='openpyxl')
big_df.to_excel(writer, '2014')
writer.save()
Is it possible to create a dataframe automatically for each directory without having to manually create one for each folder like this:
files14 = glob('2014/*.csv')
files15 = glob('2015/*.csv')
sorted_files14 = natsorted(files14)
sorted_files15 = natsorted(files15)

def read_9th(fn):
    return pd.read_csv(fn, usecols=[9], names=headers)

big_df = pd.concat([read_9th(fn) for fn in sorted_files14], axis=1)
big_df1 = pd.concat([read_9th(fn) for fn in sorted_files15], axis=1)

writer = pd.ExcelWriter('concentrated.xlsx', engine='openpyxl')
big_df.to_excel(writer, '2014')
big_df1.to_excel(writer, '2015')
writer.save()
If you get a list of the folders that you want to process, e.g.
folders = os.listdir('.')
# or
folders = ['2014', '2015', '2016']
You could do something like:
writer = pd.ExcelWriter('concentrated.xlsx', engine='openpyxl')

for folder in folders:
    files = glob('%s/*.csv' % folder)
    sorted_files = natsorted(files)
    big_df = pd.concat([read_9th(fn) for fn in sorted_files], axis=1)
    big_df.to_excel(writer, folder)

writer.save()