Merge Multiple Excel files having multiple sheets to One Excel file, - python

I am trying to do this
Multiple Excel files having multiple sheets to One Excel file, having merged data in multiple sheets.
If all files have sheets "A", "B" and "C", the data from each sheet should be concatenated into a single file under the same sheet names. I wrote the code below, but it fails with the error: 'NoneType' object has no attribute 'to_excel'
import pandas as pd
#return all file paths that match a specific pattern in our case we want all *.xlsx
import glob
import os
import openpyxl
import xlrd
#reading excel files folder
# Glob pattern matching every .xlsx workbook in the input folder.
location = r"C:\-----Desktop\python\Input/*.xlsx"
excel_files = glob.glob(location)
# NOTE(review): `writer` is not referenced again in the visible snippet; the
# final to_excel call below writes to a different path instead.
with pd.ExcelWriter(r"C:\---on\Output\filemergetest.xlsx") as writer: # Excel writer for multiple sheets
for files in excel_files:
# NOTE(review): `sheet` computed here is rebound by the
# `for sheet in excel_files` loop further down, so this value is not used.
sheet = os.path.basename(files) # file name without its directory
sheet = sheet.split(".")[0] # text before the first dot
list_of_dfs = []
# Iterate through each workbook path
for sheet in excel_files:
fi = pd.ExcelFile(sheet)
for sh in fi.sheet_names:
# Parse data from a worksheet as a Pandas DataFrame
# NOTE(review): sheet_name=0 selects the first worksheet, so the loop
# variable `sh` is never used and the same sheet is parsed repeatedly.
dfC = fi.parse(sheet_name=0)
# And append it to the list
# NOTE(review): list.append returns None, so data2 is None here; this is
# the source of the reported "'NoneType' object has no attribute 'to_excel'".
data2 = list_of_dfs.append(dfC)
# Combine all DataFrames into one
#data2 = pd.concat(list_of_dfs,ignore_index=True)
# Preview the result
data2
data2.to_excel(r"C:----python\Output\jointsheetsTAA.xlsx",index=False)

Related

pandas loop through excel files and sheets

Need help please.
Using python 3.
I need to loop through a folder that contains excel files and each file has multiple sheets.
How do I loop through all the files and all the sheets and extract to a dataframe?
What I was able to accomplish only returns one excel file and all the worksheets for that file but I need for all files. Please help.
This is what I have so far:
from xlsxwriter import Workbook
import pandas as pd
import openpyxl
import glob
import os
# Glob pattern for the workbooks to import.
path = 'filestoimport/*.xlsx'
for filepath in glob.glob(path):
xl = pd.ExcelFile(filepath)
# Define an empty list to store individual DataFrames
# NOTE(review): if this initialisation sits inside the per-file loop, the
# list is reset for every workbook, which would match the reported symptom
# of only one file's sheets surviving -- confirm against the real indentation.
list_of_dfs = []
# NOTE(review): list_of_dferror is not used anywhere in the visible snippet.
list_of_dferror= []
for sheet_name in xl.sheet_names:
# Read the listed spreadsheet columns, using the first row as the header.
df = xl.parse(sheet_name, usecols='A,D,N,B,C,E,F,G,H,I,J,K,L,M', header=0)
# Strip spaces from the column labels.
df.columns = df.columns.str.replace(' ', '')
df['sheetname'] = sheet_name # this adds `sheet_name` into the column
# Record the source workbook's base name (without its directory)
# in its own column.
file_name = os.path.basename(filepath)
df['sourcefilename'] = file_name
# only add sheets containing columns ['Status', 'ProjectID']
column_names = ['Status', 'ProjectID']
if set(column_names).issubset(df.columns):
df['Status'].fillna('', inplace=True)
# NOTE(review): 'Addedby' is not part of the column check above, so this
# line presumably raises KeyError for sheets lacking that column -- confirm.
df['Addedby'].fillna('', inplace=True)
# And append it to the list
list_of_dfs.append(df)
# Combine all DataFrames into one
data = pd.concat(list_of_dfs, ignore_index=True)

Extracting multiple excel files as Pandas data frame

I'm trying to create a data ingestion routine to load data from multiple excel files with multiple tabs and columns in the pandas data frame. The structuring of the tabs in each of the excel files is the same. Any help would be appreciated!!
# Load every worksheet of every workbook found in `folder` into nested dicts:
# sheet_contents[<file name without extension>][<sheet name>] -> DataFrame.
folder = "specified_path"
files = os.listdir(folder)
sheet_contents = {}
for file in files:
    # os.path.join supplies the path separator that `folder + file` omitted.
    # The `with` block closes the workbook handle once its sheets are parsed.
    with pd.ExcelFile(os.path.join(folder, file)) as data:
        file_data = {}
        for sheet in data.sheet_names:
            file_data[sheet] = data.parse(sheet)
    # splitext removes the real extension instead of assuming it is exactly
    # five characters long (".xlsx"), so ".xls" files are keyed correctly too.
    sheet_contents[os.path.splitext(file)[0]] = file_data
One of the ways to create a dataframe for each excelfile (stored in a specific folder and that holds multiple sheets) is by using pandas.read_excel and pandas.concat combined. By passing the parameter sheet_name=None to pandas.read_excel, we can read in all the sheets in the excelfile at one time.
Try this :
import os
import pandas as pd
# Build one DataFrame per workbook in `folder`, stacking all of its sheets.
folder = 'specified_path'
# Keep only Excel workbooks; any other file in the folder would make
# read_excel fail.
excel_files = [
    file for file in os.listdir(folder)
    if file.lower().endswith(('.xlsx', '.xlsm', '.xls'))
]
list_of_dfs = []
for file in excel_files:
    # sheet_name=None returns a {sheet name: DataFrame} mapping, which
    # pd.concat stacks into a single frame. os.path.join keeps the path
    # portable instead of hard-coding the Windows "\\" separator.
    sheets = pd.read_excel(os.path.join(folder, file), sheet_name=None)
    df = pd.concat(sheets, ignore_index=True)
    # splitext strips only the final extension, so a name such as
    # "report.2023.xlsx" is recorded as "report.2023" rather than "report".
    df['excelfile_name'] = os.path.splitext(file)[0]
    list_of_dfs.append(df)
To access to one of the dataframes created, you can use its index (e.g, list_of_dfs[0]) :
print(type(list_of_dfs[0]))
<class 'pandas.core.frame.DataFrame'>

Python, how to combine different excel workbooks into one excel workbook as sheets

is there any way in python by which we can combine different excel workbooks into one excel workbook having sheets containing data of those different excel workbooks?
For example lets say I am having two excel workbooks 1) emp.xlsx and 2) dept.xlsx i want output as output.xlsx (having worksheets as emp and dept with data of emp.xlsx and dept.xlsx). Request you to please share your thoughts on this.
Regards
Kawaljeet
What you need to do is get each sheet one by one and then create an excel with each one of those sheets. You can use the file name to name the new sheets as in emp-sheet1, emp-sheet2, dept-sheet1, and so on.
The next example assumes you have two Excel files named emp.xlsx and dept.xlsx and generates a new output.xlsx file containing all the sheets and values:
#!pip install openpyxl
#!pip install xlrd
import pandas as pd
def get_sheets(filenames):
    '''
    This function generates dataframes from excel sheets.

    Every worksheet of every workbook in ``filenames`` is parsed with
    ``header=None``, so the first spreadsheet row is kept as data rather
    than being used as column labels.

    Returns:
    - dfs: a list of dataframes one for each sheet
    - sheets: combined names for the new sheets filename+-+sheetname,
      where filename is the base name of the workbook without its extension

    NOTE: Excel limits sheet names to 31 characters; long file/sheet name
    combinations may need shortening before they are written out.
    '''
    import os  # local import: the surrounding snippet only imports pandas

    sheets = []
    dfs = []
    for file in filenames:
        # Use the bare file name minus its final extension. Splitting the
        # whole path on '.' broke for inputs such as './emp.xlsx' or
        # 'data/my.report.xlsx' and could leave '/' in the sheet name.
        stem = os.path.splitext(os.path.basename(file))[0]
        # The context manager closes the workbook handle after parsing.
        with pd.ExcelFile(file) as xl:
            for sheet in xl.sheet_names:
                dfs.append(xl.parse(sheet, header=None))
                sheets.append(stem + '-' + sheet)
    return dfs, sheets
def save_xls(dfs, sheets, xls_path):
    '''
    Saves each dataframe in dfs as a sheet with the name in sheets
    into the file specified in xls_path.

    Parameters:
    - dfs: list of dataframes to write
    - sheets: sheet names, sheets[n] being the name used for dfs[n]
    - xls_path: path of the workbook to create

    Neither the index nor a header row is written, so each sheet contains
    only the dataframe's values.
    '''
    # Leaving the `with` block saves and closes the workbook. The explicit
    # writer.save() call was redundant and no longer exists in pandas 2.0,
    # where it raises AttributeError.
    with pd.ExcelWriter(xls_path) as writer:
        for n, df in enumerate(dfs):
            # sheet_name is passed by keyword (positional use is deprecated)
            # and header=False is the documented way to omit the header row.
            df.to_excel(writer, sheet_name=sheets[n], index=False, header=False)
# Example run: read every sheet from the two workbooks and write them all
# into output.xlsx, one output sheet per source sheet.
filenames = ['emp.xlsx', 'dept.xlsx']
dfs, sheets = get_sheets(filenames)
save_xls(dfs, sheets, 'output.xlsx')

Write CSV content to Excel produces empty sheets

Writing DataFrame to excel file leaves sheets with zero data.
I am creating a robotics "scouting application". It receives multiple .csv files throughout the course of two days. The csv files are named with a four-digit team number, a hyphen, and then a match number, for example "2073-18.csv". Multiple files for each team will arrive. I need one sheet for each team with the content of each csv file on the same sheet for that team. Creating the sheets works; writing the data to these sheets doesn't.
import os
import glob
import csv
from xlsxwriter.workbook import Workbook
import pandas as pd
import numpy as np
#from sqlalchemy import create_engine
from openpyxl import load_workbook
# Work from the directory that receives the scouting CSV files.
os.chdir ("/EagleScout")
# NOTE(review): path, extension and engine are assigned but not referenced
# again in the visible snippet.
path = '.'
extension = 'csv'
engine = 'xlsxwriter'
files_in_dir = [ f for f in glob.glob('*.csv')]
# NOTE(review): an xlsxwriter Workbook and a pandas ExcelWriter are both
# opened on the same 'Tournament.xlsx'. They are independent objects, and
# workbook.close() runs last, so presumably the workbook that only had empty
# worksheets added overwrites what pandas saved -- a likely cause of the
# empty sheets; confirm.
workbook = Workbook('Tournament.xlsx')
with pd.ExcelWriter('Tournament.xlsx') as writer:
for csvfile in files_in_dir:
df = pd.read_csv(csvfile)
fName, fExt = (os.path.splitext(csvfile))
# Split "<team>-<match>" so that sName[0] is the text before the hyphen.
sName = fName.split('-')
worksheet = workbook.get_worksheet_by_name(sName [0])
if worksheet is None:
worksheet = workbook.add_worksheet(sName [0]) # worksheet named after the text before the hyphen
# NOTE(review): every CSV for a team is written to the same sheet name at the
# default position, so later files presumably replace earlier ones instead of
# being appended -- confirm.
df.to_excel(writer, sheet_name = (sName[0]))
writer.save()
workbook.close()
What I need is one workbook with one sheet for each team, up to 70 teams. Each sheet will have multiple rows, one for each csv file that arrived for that team. The question is: how do I get pandas, or another library, to write the content of each csv file to its appropriate sheet in the workbook?
OK, with the input from #ivan_pozdeev, I finally got past my issues.
Remember, my original desire was to generate a script that could be run on a regular basis and generate a spreadsheet with multiple worksheets. Each worksheet would contain all the data from the .csv files for every match that had played, and grouped by the team number.
I have also added a single spreadsheet that contains the raw data.
Here is what I came up with:
import os
import glob
import csv
import xlsxwriter
from xlsxwriter.workbook import Workbook
import pandas as pd
import numpy as np
#from sqlalchemy import create_engine
#import openpyxl
#from openpyxl import load_workbook
# Work from the directory that receives the scouting CSV files.
os.chdir("/EagleScout")
path = '.'
extension = 'csv'

# Remove the outputs of previous runs so each run starts from clean data:
# the combined CSV, the per-team workbook and the raw-data workbook.
for stale_output in (
    './Spreadsheets/combined.csv',
    './Spreadsheets/Tournament.xlsx',
    './Spreadsheets/Combined.xlsx',
):
    if os.path.exists(stale_output):
        os.remove(stale_output)

# Collect the names of all .csv files in the working directory.
files_in_dir = glob.glob('*.csv')

# Create a single combined .csv file with all data from all matches
# completed so far, starting with the contents of Header.txt.
d1 = pd.read_csv('Header.txt')
d1.to_csv('./Spreadsheets/combined.csv', header=True, index=False)
for filenames in files_in_dir:
    df = pd.read_csv(filenames)
    fName, fExt = os.path.splitext(filenames)
    # File names look like "<team>-<match>": sName[0] is the team number
    # and sName[1] the match number.
    sName = fName.split('-')
    N = sName[1]
    # Insert a leading column whose label and value are both the match number.
    df.insert(0, N, N, True)
    # Append this file's rows, labelling the index column with the team number.
    df.to_csv('./Spreadsheets/combined.csv', index_label=sName[0], mode='a')

# Copy the combined CSV into one master raw-data workbook.
# Leaving the `with` block saves and closes the workbook. The explicit
# writer.save() calls were redundant and no longer exist in pandas 2.0.
with pd.ExcelWriter('./Spreadsheets/Combined.xlsx') as writer:
    dt = pd.read_csv('./Spreadsheets/combined.csv')
    dt.to_excel(writer, sheet_name='All data')

# Split the combined data on its 'Team' column and write one worksheet
# per team.
with pd.ExcelWriter('./Spreadsheets/Tournament.xlsx') as writer:
    df2 = pd.read_excel('./Spreadsheets/Combined.xlsx')
    group = df2.groupby('Team')
    for Team, Team_df in group:
        Team_df.to_excel(writer, sheet_name=str(Team))
I am certain there is a cleaner way to do this code, I'm still new at this, but for now it does what I expect.

Read excel files from a folder, extract a cell from sheet-1 and append values to a new excel sheet

I have around 100 Excel files in a folder. I need to extract a single cell, say the name in cell D6, from sheet 1 of each Excel file and write the values to a new Excel file/sheet. I have followed a few SO questions but have not been able to produce the desired output. This is the error I get when I run the program below: `
TypeError: cannot concatenate a non-NDFrame object
`
import os
import pandas as pd
import xlrd
import xlwt
# NOTE(review): `path` is not defined anywhere in the visible snippet.
files = os.listdir(path)
files
all_data = pd.DataFrame()
for file in files:
# NOTE(review): os.listdir returns bare file names, so this presumably only
# works when the current working directory is `path` -- confirm.
wb = xlrd.open_workbook(file)
sheet = wb.sheet_by_index(0)
# Zero-based row 5, column 3, i.e. cell D6 of the first worksheet.
df = sheet.cell_value(5,3)
# NOTE(review): `df` is a single cell value, not a DataFrame, which is what
# triggers the reported "cannot concatenate a non-NDFrame object" error.
# The return value of append is also discarded, so all_data is never updated.
all_data.append(df,ignore_index=True)
writer = pd.ExcelWriter('output.xlsx', engine='xlsxwriter')
all_data.to_excel(writer,'sheet1')
writer.save()
Your error says that you can only concatenate a DataFrame with another DataFrame. When you read the cell with xlrd you don't get a DataFrame object, so either turn the single cell into a DataFrame, or store the values temporarily and build the DataFrame afterwards.
something like this (untested) should do it.
# Collect cell D6 from the first worksheet of every workbook in `files`.
all_data = []  # list of extracted cell values, one per workbook
for file in files:
    # sheet_name=0 selects the first worksheet; the old `sheetname` keyword
    # is no longer accepted by pandas.
    # header=None keeps the first spreadsheet row as data, so positional
    # index [5, 3] is zero-based row 5, column 3 -- cell D6. With the
    # default header row consumed it would have pointed at D7.
    df = pd.read_excel(file, sheet_name=0, header=None)
    all_data.append(df.iloc[5, 3])
all_data = pd.DataFrame(all_data)  # dataframe
all_data.to_excel('all_data.xlsx')
or one could use other libraries as well to make the same, like openpyxl for example.

Categories

Resources