pandas loop through excel files and sheets - python

Need help please.
Using python 3.
I need to loop through a folder that contains excel files and each file has multiple sheets.
How do I loop through all the files and all the sheets and extract to a dataframe?
What I was able to accomplish only returns one excel file and all the worksheets for that file but I need for all files. Please help.
This is what I have so far:
from xlsxwriter import Workbook
import pandas as pd
import openpyxl
import glob
import os
path = 'filestoimport/*.xlsx'
for filepath in glob.glob(path):
xl = pd.ExcelFile(filepath)
# Define an empty list to store individual DataFrames
list_of_dfs = []
list_of_dferror= []
for sheet_name in xl.sheet_names:
df = xl.parse(sheet_name, usecols='A,D,N,B,C,E,F,G,H,I,J,K,L,M', header=0)
df.columns = df.columns.str.replace(' ', '')
df['sheetname'] = sheet_name # this adds `sheet_name` into the column
# using basename function from os
# module to print file name
file_name = os.path.basename(filepath)
df['sourcefilename'] = file_name
# only add sheets containing columns ['Status', 'ProjectID']
column_names = ['Status', 'ProjectID']
if set(column_names).issubset(df.columns):
df['Status'].fillna('', inplace=True)
df['Addedby'].fillna('', inplace=True)
# And append it to the list
list_of_dfs.append(df)
# Combine all DataFrames into one
data = pd.concat(list_of_dfs, ignore_index=True)

Related

Python Pandas csv files to Excel worksheets - Cleanup

I want to take multiple .csv files and convert them to Excel worksheets in one workbook, specifically using Pandas.
I finally got this to work, but I know the code itself is of poorly written.
Any suggestions on how to clean this up?
"Beautify is better than Ugly"
Here is the code:
import pandas as pd
import os
import openpyxl as xl
directory = os.path.join(os.curdir, "data/")
new_xl_file_path = "csv_merge.xlsx"
new_xl_file = xl.Workbook() # Create a new Excel workbook
new_xl_file.save(new_xl_file_path)
name_list = os.listdir(directory) # file1.csv, file2.csv, file3.csv, etc...
full_path_list = [] # For reading with pd.read_csv()
data_frame_list = [] # List to save .csv dataframes
for filename in os.listdir(directory):
f = os.path.join(directory, filename) # Get full path name
df = pd.read_csv(f)
data_frame_list.append(df)
counter = 0
with pd.ExcelWriter(new_xl_file_path) as writer:
for dataframe in data_frame_list:
dataframe.to_excel(writer, index=False, sheet_name=name_list[counter])
counter += 1

Grabbing a single Excel worksheet from multiple workbooks into a pandas dataframe and saving this

I need to extract an Excel worksheet from multiple workbooks and saving it to a dataframe and in turn saving that dataframe.
I have a spreadsheet that is generated at the end of each month (e.g.
June 2019.xlsx, May 2019.xlsx, April 2019.xlsx).
I need to grab a worksheet 'Sheet1'from each of these workbooks and convert these to a dataframe (df1).
I would like to have this dataframe saved.
As a nice to have, I would also like some way just to append the next month's data after the initial 'data grab'.
I'm relatively new to this, so I haven't made much progress.
import os
import glob
import pandas as pd
import xlrd
import json
import io
import flatten_json
files = glob.glob('/Users/ngove/Documents/Python Scripts/2019/*.xlsx')
dfs={}
for f in files:
dfs[os.path.splitext(os.path.basename(f))[0]] = pd.read_excel(f)
You can drop all of your files in a directory (e.g. current directory). Then append all of your excel files in a list (e.g. files_xls). Iterate over all your files and use pandas.read_excel to get the respective dataframes (e.g. list_frames).
Below, you can find an example:
import os
import pandas as pd
path = os.getcwd() # get cur dir
files = os.listdir(path) # get all the files in your cur dir
# get only the xls or xlsm (this depends on you)
files_xls = [f for f in files if (f[-3:] == 'xls' or f[-4:] == 'xlsm')]
df = pd.DataFrame()
list_frames = []
for f in files_xls:
print("Processing file: %s" %f)
try:
# the following will give you the dataframe
# the fun params depends on your data format
data = pd.read_excel(f, 'Sheet1', header=0, index_col=None,
sep='delimiter', error_bad_lines=False,
skip_blank_lines=True, comment=',,')
except:
pass
list_frames.append(data)
# at the end you can concat your data if you want and remove any dublicate
df = pd.concat(list_frames, sort=False).fillna(0)
df = df.drop_duplicates()
# at the end you can save it
writer = pd.ExcelWriter("your_title" + ".xlsx", engine='xlsxwriter')
df.to_excel(writer, sheet_name="Sheets1", index=False)
writer.save()
I hope this helps.
I interpreted your statement that you want to save the dataframe as that you want to save it as a combined Excel file. This will combine all files in the folder specified that end in xlsx.
import os
import pandas as pd
from pandas import ExcelWriter
os.chdir("H:/Python/Reports/") #edit this to be your path
path = os.getcwd()
files = os.listdir(path)
files_xlsx = [f for f in files if f[-4:] == 'xlsx']
df = pd.DataFrame()
for f in files_xlsx:
data = pd.read_excel(f, 'Sheet1')
df = df.append(data)
writer=ExcelWriter('Combined_Data.xlsx')
df.to_excel(writer,'Sheet1',index=False)
writer.save()
You could update the code to grab all 2019 files by changing the one line to this:
files_xlsx = [f for f in files if f[-9:] == '2019.xlsx']
I referenced this question for most of the code and updated for xlsx and added the file save portion of the code

Iterate through excel files and sheets and concatenate in Python

Say I have a folder which have multiple excel files with extension xlsx or xls, they share same header column a, b, c, d, e except some empty sheet in several files.
I want to iterate all the files and sheets (except for empty sheets) and concatenate them into one sheet of one file output.xlsx.
I have iterated through all excel files and append them to one file, but how could I iterate through all the sheets of each files if they have more than one sheets?
I need to integrate two block of code below into one. Thanks for your help.
import pandas as pd
import numpy as np
import glob
path = os.getcwd()
files = os.listdir(path)
files
df = pd.DataFrame()
# method 1
excel_files = [f for f in files if f[-4:] == 'xlsx' or f[-3:] == 'xls']
excel_files
for f in excel_files:
data = pd.read_excel(f)
df = df.append(data)
# method 2
for f in glob.glob("*.xlsx" or "*.xls"):
data = pd.read_excel(f)
df = df.append(data, ignore_index=True)
# save the data frame
writer = pd.ExcelWriter('output.xlsx')
df.to_excel(writer, 'sheet1')
writer.save()
For one file to concatenate multiple sheets:
file = pd.ExcelFile('file.xlsx')
names = file.sheet_names # read all sheet names
df = pd.concat([file.parse(name) for name in names])
import pandas as pd
path = os.getcwd()
files = os.listdir(path)
files
excel_files = [file for file in files if '.xls' in file]
excel_files
def create_df_from_excel(file_name):
file = pd.ExcelFile(file_name)
names = file.sheet_names
return pd.concat([file.parse(name) for name in names])
df = pd.concat(
[create_df_from_excel(xl) for xl in excel_files]
)
# save the data frame
writer = pd.ExcelWriter('output.xlsx')
df.to_excel(writer, 'sheet1')
writer.save()

How to split a large excel file into multiple worksheets based on their given ip address using pandas python

I'm new to pandas and python so ran into some trouble. I have a one large excel file which i need to divide into multiple worksheets using a python script. That i have to divide base on the ip addresses given in the data. I can't figure out how to do that and would appreciate some help and guidance.
I have no knowledge of working with python or any libraries before. This is what i did but created workbooks for each row.
import pandas as pd
df = pd.read_excel("D:/Users/Zakir/Desktop/MyNotebooks/Legacy.xls", sheet_name="Total", header=0, names=None, index_col=None, parse_cols=None, usecols=None, squeeze=False, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, keep_default_na=True, verbose=False, parse_dates=False, date_parser=None, thousands=None, comment=None, skipfooter=0, convert_float=True, mangle_dupe_cols=True)
writer = pd.ExcelWriter('D:/Users/Zakir/Desktop/MyNotebooks/pandas_simple.xlsx', engine='xlsxwriter')
for index, row in df.iterrows():
df1 = df.iloc[[index]]
df1.set_index('Number',inplace=True)
df1.to_excel(writer, sheet_name=row['IPAddress'])
writer.save()
This is the kind of excel file i have. over 5000 rows. There are 60 groups of ip addresses and have to divide each group into its own worksheet
one solution if you have enough memory:
from pandas import ExcelWriter
df = pd.read_excel('file',sheet_name="Total", header=0, #other settings.....#)
writer = ExcelWriter('E:/output.xlsx',engine='xlsxwriter')
print(df)
def writesheet(g):
a = g['IPAddress'].tolist()[0]
g.to_excel(writer, sheet_name=str(a), index=False)# index = True if you want to keep index
df.groupby('IPAddress').apply(writesheet)
writer.save()
This is how I implemented the code to check a folder, loop through all excel files and split each file by the values of a column name, which can be passed as input(vColName), assuming one sheet in the file:
import sys
import os, shutil
from os import listdir
from os.path import isfile, join
import pandas as pd
import urllib as ul
import datetime
import xlrd
#this method retrieves all the xlsx filenames from a folder
def find_excel_filenames( path_to_dir, suffix=".xlsx" ):
filenames = listdir(path_to_dir)
return [ filename for filename in filenames if filename.endswith( suffix ) ]
#this folder contains .xlsx files
filePath = "D:\files\sample\"
#there is a subfolder in my solution to move the processed files to
#and another subfolder to move the splitted output files
archivePath = os.path.join(filePath, "archive")
outPath = os.path.join(filePath, "output")
#get a list of filenames
fnames = find_excel_filenames(filePath)
#loop through each file
for fl in fnames:
vFile = os.path.join(filePath, fl)
#load the content of the file to a data frame,
#I open the file twice, first to get the number of columns and
#create the converter, then to open the file with string converter
#it helps with trimming of leading zeros
df = pd.read_excel(vFile, header=None)
column_list = []
for i in df:
column_list.append(i)
converter = {col: str for col in column_list}
df1 = pd.read_excel(vFile, converters=converter)
colValues=df1[vColName].unique().tolist()
for v in colValues:
filteredDF = df1.loc[df1[vColName]==v]
vOutFile = os.path.join(outPath, fl+''_''+v.replace("/"," ")+''.xlsx'')
writer = pd.ExcelWriter(vOutFile, engine=''xlsxwriter'')
# Convert the dataframe to an XlsxWriter Excel object.
filteredDF.to_excel(writer, sheet_name=''Sheet1'')
# Close the Pandas Excel writer and output the Excel file.
writer.save()
#move the processed file to an archive folder
dst_file = os.path.join(archivePath, fl)
if os.path.exists(dst_file):
os.remove(dst_file)
shutil.move(vFile, archivePath)

Convert multiple xlsm files automatically to multiple csv files by using pandas

I have 300 raw datas (.xlsm) and wanne to extract useful datas and turn them to csv files as input for subsequent neural network, now i try to implement them with 10 datas as example, i have sucessfully extracted the informations what i need, but i dont know how to convert them to csv files with the same name, for single data we can use df.to_csv, but how about for all the datas? with for function?
import glob
import pandas as pd
import numpy as np
import csv
import os
excel_files = glob.glob('../../Versuch/Versuche/RohBeispiel/*.xlsm')
directory = '/Beispiel'
for files in excel_files:
data = pd.read_excel(files)
# getting the list of rows and columns you need
list_of_dfs = pd.DataFrame(data.values[0:600:,12:26],
columns=data.columns[12:26]).drop(['Sauberkeit', 'Temparatur'], axis=1)
# converting pandas dataframe columns to numeric: string into float
cols = ['KonzA', 'KonzB', 'KonzC', 'TempA',
'TempB', 'TempC', 'Modul1', 'Modul2',
'Modul3', 'Modul4', 'Modul5', 'Modul6']
list_of_dfs[cols] = list_of_dfs[cols].apply(pd.to_numeric, errors='coerce', axis=1)
# Filling down from a column through missing data
for fec in list_of_dfs[cols]:
list_of_dfs[fec].fillna(method='ffill', inplace=True)
csvfilename = files.split('/')[-1].split('.')[0] + '.csv'
newtempfile = os.path.join(directory,csvfilename)
print(newtempfile)
print(list_of_dfs.head(2))
problem is solved.
folder_name = 'Beispiel'
csvfilename = files.split('/')[-1].split('.')[0] + '.csv' # change into csv files
newtempfile = os.path.join(folder_name, csvfilename)
# Verify if directory exists
if not os.path.exists(folder_name):
os.makedirs(folder_name) # If not, create it
print(newtempfile)
list_of_dfs.to_csv(newtempfile, index=False)
The easiest way of doing this is to get the filename from the excel and then use the os.path.join() method to save it to the directory you want.
directory = "C:/Test"
for files in excel_files:
csvfilename = (os.path.basename(file)[-1]).replace('.xlsm','.csv')
newtempfile=os.path.join(directory,csvfilename)
Since you already have the excel df you want to push into the csv file, just add the above code to the loop and change the output csv file to 'newtempfile' and that should do it.
df.to_csv(newtempfile, 'Beispel/data{0}.csv'.format(idx))
Hope this helps. :)
Updated Code:
cols = ['KonzA', 'KonzB', 'KonzC', 'TempA',
'TempB', 'TempC', 'Modul1', 'Modul2',
'Modul3', 'Modul4', 'Modul5', 'Modul6']
excel_files = glob.glob('../../Versuch/Versuche/RohBeispiel/*.xlsm')
for file in excel_files:
data = pd.read_excel(file, columns = cols) # import only the columns you need to the dataframe
csvfilename = (os.path.basename(files)[-1]).replace('.xlsm','.csv')
newtempfile=os.path.join(directory,csvfilename)
# converting pandas dataframe columns to numeric: string into float
data[cols] = data[cols].apply(pd.to_numeric, errors='coerce', axis=1)
data[cols].fillna(method='ffill', inplace=True)
data.to_csv(newtempfile).format(idx)

Categories

Resources