Merging Multiple xlsx Files into one sheet - python

Currently I am trying to merge multiple excel files into one Using python. What I have so far is as follows:
sharedDocs = "C:\\SPSharedDocuments\\*.xlsx"
invoices = "C:\\SPInvoices\\*.xlsx"
formsCerts = "C:\\SPForms&Certificates\\*.xlsx"
mgmt = "C:\\SPManagement\\*.xlsx"
files = [sharedDocs, invoices, formsCerts, mgmt]
for docs in files:
excel = []
for file in glob.glob(docs):
excel.append(file)
excels = [pd.ExcelFile(name) for name in excel]
frames = [x.parse(x.sheet_names[0], header=None, index_col=None) for x in excels]
frames_new = [df[1:] for df in frames[1:]]
combined = pd.concat(frames_new)
if sharedDocs == docs:
combined.to_excel("SharedDocsMerged.xlsx", header = False, index = False)
elif invoices == docs:
combined.to_excel("InvoicesMerged.xlsx", header = False, index = False)
elif formsCerts == docs:
combined.to_excel("FormsCertsMerged.xlsx", header = False, index = False)
else:
combined.to_excel("MGMTMerged.xlsx", header = False, index = False)
This works but it does not copy the first header so that I know what the name for each column is. Before I had the line that read frames_new = [df[1:] for df in frames[1:]] as frames[1:] = [df[1:] for df in frames[1:]] but this was causing multiple copies of the same file.
All I need is it to copy one header so I know the value of each column.
Your help is much appreciated and thank you in advance.
UPDATE:
I tried using the post that put below suggesting it was a similar question and I edited my code to look like this:
sharedDocs = "C:\\SPSharedDocuments\\*.xlsx"
invoices = "C:\\SPInvoices\\*.xlsx"
formsCerts = "C:\\SPForms&Certificates\\*.xlsx"
mgmt = "C:\\SPManagement\\*.xlsx"
files = [sharedDocs, invoices, formsCerts, mgmt]
for docs in files:
excel = []
for file in glob.glob(docs):
excel.append(pd.read_excel(file))
df = pd.concat(excel, ignore_index=True, sort = True)
if sharedDocs == docs:
df.to_excel("SharedDocsMerged.xlsx", header = False, index = False)
elif invoices == docs:
df.to_excel("InvoicesMerged.xlsx", header = False, index = False)
elif formsCerts == docs:
df.to_excel("FormsCertsMerged.xlsx", header = False, index = False)
else:
df.to_excel("MGMTMerged.xlsx", header = False, index = False)
the result that I get is 2 extra columns on the left, a missing column and still no header.

Related

Combine Dataframes resulting from a for loop

I need a little help in appending the data thats getting generated out of the for loop below. Currenlty, im writing it to a dataframe in line "df = pd.DataFrame(li_row, columns=col_names)"
But when I have multiple files which starts from PAJ, I need the resulted Dataframe to be appended to one Dataframe.
Also, the below is a bits and pieces we gathered and amended to suit our need. please excuse me in case you feel its a mess. :)
import xmlschema
import os
import xml.etree.ElementTree as ET
import pandas as pd
dirpath = "C:\\Users\\xxxxx\\PycharmProjects\\pythonProject\\xmls"
filenames = os.listdir("C:\\Users\\xxxxx\\PycharmProjects\\pythonProject\\xmls")
# print(filenames)
for eachfile in filenames:
fname = eachfile[0:3]
print(dirpath+'\\'+eachfile)
if fname == 'PAJ':
xmlschema.validate(dirpath+'\\'+eachfile, 'PAJ.xsd')
tree = ET.parse(eachfile)
root = tree.getroot()
# Get AlertID from header
cols = {}
for header in root.findall(".//header/alertId"):
cols[header.tag] = header.text
# print(cols)
# get detailhr to be used for column header names
col_names = []
for DtHeader in root.findall(".//detailHdr/c"):
col_names.append(DtHeader.text)
# print(col_names)
# Get row and c
li_row = []
size = 0
for Data in root.findall(".//report/data"):
for child in Data:
# print(child.tag,child.text,len(Data))
li_row.append([])
for grandchild in child:
# print(grandchild.tag, grandchild.text,len(child))
li_row[size].append(grandchild.text)
size += 1
# print(li_row)
# create a dataframe with the col_names and row with c and alertid added at the end
df = pd.DataFrame(li_row, columns=col_names)
df['alertId'] = cols['alertId']
print(df)
elif fname == 'PIE':
fileContent = ''
with open(dirpath + '\\' + eachfile) as filehandle:
fileContent = filehandle.read()
modFileContent = fileContent.replace("UTF-16", "UTF-8")
xmlschema.validate(modFileContent, 'PIE.xsd')
So if i were to change your current solution as little as possible I create a list of paj_data_frames and concatenate them once the script was done. Look at pd.concat documentation https://pandas.pydata.org/docs/user_guide/merging.html
paj_data_frames = []
for eachfile in filenames:
....
if fname == 'PAJ':
df = pd.DataFrame(li_row, columns=col_names)
df['alertId'] = cols['alertId']
paj_data_frames.append(df)
....
final_df = pd.concat(paj_data_frames)

Read and Write multiple excel data into one excel file using openpyxl

I am trying to copy the data from multiple excel into one excel. I am novice to python and openpyxl. So i have opened each file and went row by row and copied them. I want to do this with multiple files. How do i loop through row and columns and copy the data consider the column in all the files are same order?
import openpyxl as xl
from openpyxl import workbook
incident_wb = xl.load_workbook('incident resolved yesterday.xlsx')
incident_sheet = incident_wb['Page 1']
combined_wb = xl.Workbook()
combined_sheet = combined_wb.active
combined_sheet.title = "combined_sheet"
combined_wb.save('combined_sheet.xlsx')
for row in range(1, incident_sheet.max_row+1):
incident_no = incident_sheet.cell(row,1)
opened_date = incident_sheet.cell(row,2)
shrt_desc = incident_sheet.cell(row,3)
requester = incident_sheet.cell(row,4)
incdnt_type = incident_sheet.cell(row,5)
priority = incident_sheet.cell(row,6)
assgn_grp = incident_sheet.cell(row,7)
assgn_to = incident_sheet.cell(row,8)
updated = incident_sheet.cell(row,9)
status = incident_sheet.cell(row,10)
sub_status = incident_sheet.cell(row,11)
##copy the data into the new sheet
incident_no_1 = combined_sheet.cell(row,1)
incident_no_1.value = incident_no.value
opened_date_1 = combined_sheet.cell(row,2)
opened_date_1.value = opened_date.value
shrt_desc_1 = combined_sheet.cell(row,3)
shrt_desc_1.value = shrt_desc.value
requester_1 = combined_sheet.cell(row,4)
requester_1.value = requester.value
incdnt_type_1 = combined_sheet.cell(row,5)
incdnt_type_1.value = incdnt_type.value
priority_1 = combined_sheet.cell(row,6)
priority_1.value = priority.value
assgn_grp_1 = combined_sheet.cell(row,7)
assgn_grp_1.value = assgn_grp.value
assgn_to_1 = combined_sheet.cell(row,8)
assgn_to_1.value = assgn_to.value
updated_1 = combined_sheet.cell(row,9)
updated_1.value = updated.value
status_1 = combined_sheet.cell(row,10)
status_1.value = status.value
sub_status_1 = combined_sheet.cell(row,11)
sub_status_1.value = sub_status.value
##print(f"The incident resolved yesterday {incident_no.value}")
combined_wb.save('combined_sheet.xlsx')
An alternative approach would be to build a list of date from multiple excel files and then write it to another file.
As a proof of concept:
import openpyxl as xl
from openpyxl import workbook
def provide_data(workbookName, sheetName):
wb = xl.load_workbook(workbookName)
sheet = wb[sheetName]
return [[y.value for y in x] for x in sheet.iter_rows()]
# This creates an array of rows, which contain an array of cell values.
# It will be much better to provide mapping for cells and return business object.
def save_data(list_of_sheets):
combined_wb = xl.Workbook()
combined_sheet = combined_wb.active
combined_sheet.title = "combined_sheet"
for sheet in list_of_sheets:
for row in sheet:
combined_sheet.append(row) # combining multiple rows.
combined_wb.save('combined_sheet.xlsx')
workSheetsToCopy = [['incident resolved yesterday.xlsx', 'Page 1'], ['other.xlsx', 'Page 1']]
workSheetsToCopy = [provide_data(x[0], x[1]) for x in workSheetsToCopy]
save_data(workSheetsToCopy)

Read multiple file in python and generate one output

I have a python script for generating 1 upload file from 1 input file.
The thing is that the input files have started coming in batches, 30-50 at one time.
e.g.:
1111.xlsx --> upload.xlsx
1125.xlsx --> upload.xlsx
1176.xlsx --> upload.xlsx
1322.xlsx --> upload.xlsx
The code just converting the input files in the upload format.
Here's what I have done so far (1 input file -> 1 output file):
def main():
initial_workbook = 'C:/files/1111.xlsx'
temp_df = pd.ExcelFile(initial_workbook)
initial_df = pd.read_excel(initial_workbook, sheet_name = "default")
#drop first 4 rows to set header
new_header = initial_df.iloc[2]
initial_df = initial_df.iloc[3:]
initial_df.columns = new_header
#drop all rows with no data
indexNames = initial_df[initial_df['grade'] == 'select'].index
initial_df.drop(indexNames , inplace=True)
initial_df.dropna(axis=1, how='all')
output = initial_df.to_excel('C:/files/upload_file.xlsx', index = False)
Is there a way to generate one upload file for all the files from the input folder. And once the files input files have been processed, rename them by prefixing x in front of it. e.g. x1111.xlsx
So here is how I will approach, for a given batch:
from datetime import datetime
import os
from pathlib import Path
all_dfs = []
proj_path = Path("C:/files/")
for f in os.listdir(proj_path):
if f.endswith(".xlsx"):
print(f"processing {f}...")
df_tmp = main(proj_path / f)
df_tmp["file_name"] = f
all_dfs.append(df_tmp)
df_all = pd.concat(all_dfs, axis=0)
df_all.to_excel(proj_path / f"{datetime.now()}_batch.xlsx", index = False)
def main(f):
initial_workbook = proj_path / f
temp_df = pd.ExcelFile(initial_workbook)
initial_df = pd.read_excel(initial_workbook, sheet_name = "default")
#drop first 4 rows to set header
new_header = initial_df.iloc[2]
initial_df = initial_df.iloc[3:]
initial_df.columns = new_header
#drop all rows with no data
indexNames = initial_df[initial_df['grade'] == 'select'].index
initial_df.drop(indexNames, inplace=True)
initial_df.dropna(axis=1, how='all', inplace=True)
return initial_df
You can potentially enclose the logic for a batch in a function.

check if the csv file exists and do the condition?

Hi I am working on csv file and I have a data I want to append these data to the csv file. But firstly I want to check if the csv file exists if TRUE then just open the csv file and append the data to csv file and save it, if NOT just create a DataFrame and with these data and save it.
Note: I have a csv file in my I want to append the sample of data to my csv file
thanks in advance.
here is my trying.
#sample of data
ID = 5
img_Latitude = 38786454
img_Longitude = 1118468
meta_lat = 45778
meta_long = 886556
#create a function
def create_csv( ID, img_Latitude, img_Longitude,meta_lat, meta_long):
#check if the file is exists, if True
if os.path.isfile('C:/My/Path/compare_coordinates.csv'):
#read the csv file
df = pd.read_csv('compare_coordinates.csv')
#make pd.series
data = pd.Series([ID, img_Latitude, img_Longitude, meta_lat, meta_long],
index=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat','meta_long'])
#append the data to df
df.append(data, ignore_index=True)
else:
data = [ID, img_Latitude, img_Longitude, meta_lat, meta_long]
columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat','meta_long']
df = pd.DataFrame(data, columns).T
df.to_csv('C:/My/Path/compare_coordinates.csv', index=False)
The line df.append(data, ignore_index = True) needs to be:
df = df.append(data, ignore_index = True)
This is because DatFrame.append returns a new DF with the appended lines, it does not append in-place:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html
To get the values that needed must be saved in variable so for the line
df.append(data, ignore_index = True) to be edited to df = df.append(data, ignore_index = True) and for the getting value of file exists or not as following codes:
def create_csv( ID, img_Latitude, img_Longitude,meta_lat, meta_long):
Path = os.path.isfile('My/path/compare_coordinates1.csv')
if Path==True:
df = pd.read_csv('compare_coordinates1.csv')
data = pd.Series([ID, img_Latitude, img_Longitude, meta_lat, meta_long],
index=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat','meta_long'])
df = df.append(data, ignore_index=True)
else:
data = [ID, img_Latitude, img_Longitude, meta_lat, meta_long]
columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat','meta_long']
df = pd.DataFrame(data, columns).T
df.to_csv('My/path/compare_coordinates1.csv', index=False)

Separate Python web scraped data in different columns

I tried to scrape data by using API and put those result in an CSV file. But when I open my CSV file all the data is put together in 1 column(A). Instead I want the data to be separated in different columns(A & B (and C, D, E, F etc when I want to add info)). How can I do that?
import requests
import pandas as pd
from pandas.compat import StringIO
import numpy as np
import datetime as dt
from dateutil.relativedelta import relativedelta
import csv
csv_file = open('/Users/katewang/Desktop/Test/scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
def get_EOD_data(api_token='5cb671b0b4a790.35526238', session = None, tickers = 'AAPL', start_date = dt.datetime(2018,1,1), end_date = dt.datetime(2018,12,31)):
symbols = tickers
if session is None:
session = requests.Session()
url = 'https://eodhistoricaldata.com/api/eod/%s.US' % symbols
params = {"api_token": api_token, "from": start_date, "to": end_date}
r = session.get(url, params = params)
if r.status_code == requests.codes.ok:
cols=[0,5]
df = pd.read_csv(StringIO(r.text), skipfooter = 1, parse_dates = [0], engine = 'python', na_values=['nan'], index_col = 0, usecols = cols)
df.fillna(method = 'ffill', inplace = True)
df.fillna(method = 'bfill', inplace = True)
return df
def main():
df_data = get_EOD_data()
csv_writer.writerow([df_data])
if __name__ == '__main__':
main()
csv_file.close()
I expect to see two separate columns.
You're seeing only one column since, out of the two selected columns 0 and 5, you set column 0 to be the index when creating the dataframe. This leaves only column 5 as an actual column.
You can check for yourself by removing index_col = 0 from the line
df = pd.read_csv(StringIO(r.text), skipfooter = 1, parse_dates = [0], engine = 'python', na_values=['nan'], index_col = 0, usecols = cols)

Categories

Resources