I'm working on a script that pulls certain rows of data from multiple Excel workbooks in a folder (the critical sheet has the same name in every workbook). This code seems only to process/print results from the first file in the folder:
import os
import xlrd
for root, dirs, files in os.walk('/Users/123/Desktop/drivingtests'):
xlsfiles=[ _ for _ in files if _.endswith('.xlsx') ]
for xlsfile in xlsfiles:
workbook = xlrd.open_workbook(os.path.join(root,xlsfile))
worksheet = workbook.sheet_by_name('Sheet1')
for row in range(worksheet.nrows):
workbook = xlrd.open_workbook(os.path.join(root,xlsfile))
worksheet = workbook.sheet_by_name('Sheet1')
if worksheet.row_values(row)[0] == 'bike':
print worksheet.row_values(row)
What should be done to have the script process every workbook in the folder?
The answer is "indentions are important". When indented like the code below, it loops through all the files in the folder.
import os
import xlrd
for root, dirs, files in os.walk('/Users/123/Desktop/drivingtests'):
xlsfiles=[ _ for _ in files if _.endswith('.xlsx') ]
for xlsfile in xlsfiles:
workbook = xlrd.open_workbook(os.path.join(root,xlsfile))
worksheet = workbook.sheet_by_name('Sheet1')
for row in range(worksheet.nrows):
if worksheet.row_values(row)[0] == 'bike':
print worksheet.row_values(row)
Related
I am using python-watchdog PatternMatchingEventHandler to listen to any files with .xlsx extension. If any excel file is loaded to Home_Folder then it creates two folders Excel and CSV. Under Excel folder the loaded excel files are updated so that data starts with row 1. Under CSV folder, the transformed excel files are converted to csv. Below code is working fine. However, I was wondering if there is a way to simplify below code ? For example if you notice I am calling working directory again in main function. I am new to OOP not sure how to simplify below code? Any help is much appreciated!
Thanks in advance!
Python Code
import csv
from pathlib import Path
import openpyxl
from openpyxl import load_workbook,Workbook
import os
import pathlib
import glob
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
import time
import logging
from logging.handlers import RotatingFileHandler
def createFolders(HOME_FOLDER):
folders_name=['Excel','CSV']
for i in folders_name:
pathlib.Path(HOME_FOLDER+ i).mkdir(parents=True, exist_ok=True)
ALLOWED_EXTENSIONS = set(['xlsx'])
def allowed_file(filename):
return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
class FileWatcher(PatternMatchingEventHandler):
patterns = ["*.xlsx"] # Matching the file with extension .xlsx
def process(self, event):
# event.src_path will be the full file path
# event.event_type will be 'created', 'moved', etc.
print("{} noticed: {} on: {} ".format(time.asctime(),event.event_type, event.src_path))
def on_created(self, event):
self.process(event) #Calling above process
self.main(event) # Calling below main event
MyPWD = os.getcwd() #Is this redundant ?
for filename in os.listdir(MyPWD): #Is this redundant ?
path = os.path.join(MyPWD, filename) #Is this redundant ?
if os.path.isfile(path) and allowed_file(filename): #Is this redundant ?
XLFILE = filename #Is this redundant ?
def main(self,event):
def get_first_nonempty_row():
print('get_first_nonempty_row, mr=', max_row_in_sheet)
first_nonempty_row = None
for row in range(1, max_row_in_sheet + 1):
print('checking row', row)
for col in range(1, max_col_in_sheet + 1):
if sheet.cell(row, col).value is not None:
first_nonempty_row = row
print('first_nonempty_row =', first_nonempty_row)
return first_nonempty_row
return first_nonempty_row
def del_rows_before(first_nonempty_row):
modified = False
if first_nonempty_row > 1:
print('del_rows_before', first_nonempty_row)
print('deleting from 1 to', first_nonempty_row - 1)
modified = True
sheet.delete_rows(1, first_nonempty_row - 1)
return modified
#Splitting excel sheets into separate excel files
MyPWD = os.getcwd() #Is this redundant ?
workbooks = glob.iglob('*.xlsx') # Is this reduntant. Since Pattern handler is matching .xlsx files ?
for filename in os.listdir(MyPWD):
path = os.path.join(MyPWD, filename)
if os.path.isfile(path) and allowed_file(filename):
for workbook in workbooks:
print('reading:', workbook)
wb2 = openpyxl.load_workbook(workbook)
for sheet in wb2.worksheets:
new_wb = Workbook()
ws = new_wb.active
ws.title=sheet.title
for row_data in sheet.iter_rows():
for row_cell in row_data:
ws[row_cell.coordinate].value = row_cell.value
f_name=os.path.basename(workbook)
new_wb.save("/Excel/"+f_name[:-5]+"-"+sheet.title+"_Updated.xlsx")
new_wb.close()
#Splitting transformed excel Files. So empty rows are removed and data starst from row 1
Excel_file_path="/Excel"
for file in Path(Excel_file_path).glob('*_Updated.xlsx'):
wb=load_workbook(file)
wb_modified = False
for sheet in wb.worksheets:
max_row_in_sheet = sheet.max_row
max_col_in_sheet = sheet.max_column
print("this is max",max_row_in_sheet)
sheet_modified = False
if max_row_in_sheet >= 1:
first_nonempty_row = get_first_nonempty_row() # Function to find nonempty row
sheet_modified = del_rows_before(first_nonempty_row) #Function to delete nonempty roW
file_name = os.path.basename(file)
wb.save("/Excel/"+file_name[:-13]+"_Transformed.xlsx") #Converting Updated file to transformed file
wb.close()
#### Converting Files to CSV
ExcelPath ='/Excel'
CSV_FILE_PATH = 'CSV/'
for file in Path(ExcelPath).glob('*_Transformed.xlsx'): # Getting files with _Transformed.xlsx to convert to csv
wb = load_workbook(file)
print(file, wb.active.title)
for sheetname in wb.sheetnames:
with open(CSV_FILE_PATH+f'{file.stem[:-12]}.csv', 'w',encoding="utf-8-sig") as csvfile:
spamwriter = csv.writer(csvfile)
for row in wb[sheetname].rows:
spamwriter.writerow([cell.value for cell in row])
if __name__ == '__main__':
logging.basicConfig(handlers=[RotatingFileHandler('./my_log.log', maxBytes=100000, backupCount=10)],
level=logging.DEBUG,format="%(message)s")
HOME_FOLDER = 'Files to be tested/' #Folder to be watched
obs = Observer()
obs.schedule(FileWatcher(), path= HOME_FOLDER)
print("Monitoring started....")
createFolders(HOME_FOLDER)
#main()
obs.start() # Start watching
try:
while obs.isAlive():
obs.join()
finally:
obs.stop()
obs.join()
Before File Directory Structure
C:\Desktop\Jupyter Notebook\Files to be tested
After File Directory Structure
C:\Desktop\Jupyter Notebook\Files to be tested <- This Path has created folders Excel and CSV, and this is the path where excel files are getting uploaded and File listener is listening to this path ONLY.In below GIF image "Myexcel_file1","Myexcel_file2","Myexcel_file3" are uploaded.
C:\Desktop\Jupyter Notebook\Files to be tested\Excel <- This path contains transformed,updated excel files.
C:\Desktop\Jupyter Notebook\Files to be tested\CSV <- The "_transformed.xlsx" files from above Excel folder is getting converted to csv and saved into this path
GIF
In a folder, I have 50 excel files with multiple sheets in each file. I have to update the name of the sheet in these files where ever the sheet_name contains "XYZ".
So for each file, if the sheet_name has "XYZ", change that sheet_name to "ABC". I tried looping through the files using the following code but could not write code to change sheet names :
filelist=[]
for path, subdirs, files in os.walk(directory):
for file in files:
if (file.endswith('.xlsx') or file.endswith('.xls') or file.endswith('.XLS')):
filelist.append(os.path.join(path, file))
You can simplify like that to list files:
import os
mypath = r'C:\your\files\path'
filenames = [x for x in os.listdir(mypath) if x.endswith('.xls') or x.endswith('.xlsx') or x.endswith('.XLS')]
for filename in filenames:
a = filename.replace('XYZ','ABC')
os.rename(mypath+"/"+filename,mypath+"/"+a)
You can use openpyxl and glob
import glob
from openpyxl import load_workbook
paths = glob.glob("directory*xls*") + glob.glob("directory*XLS*")
for path in paths:
wb = load_workbook(path)
for sheetname in wb.sheetnames:
ws = wb[sheetname]
if "XYZ" in ws.title:
ws.title = "ABC"
wb.save(path)
Current python code not copying data over from multiple excels to one master excel, no errors?? Does anyone know what I have done wrong?
Sheets name Sheet1
Also need to incorporate password entry which I think I have done.
import os
import pandas as pd
import openpyxl
cwd = os.path.abspath(r'C:/Users/eldri/OneDrive/Desktop/test/')
cwd = cwd.replace("'\'", "'/'")
files = os.listdir(cwd)
shtname = ('Sheet1')
print (cwd)
print (files)
xltot = pd.DataFrame()
for file in files:
if file.endswith('*.xlsx'):
excel_file.security.workbookPassword = 'password'
excel_file = pd.ExcelFile(file)
sheets = excel_file.shtname
for sheet in sheets:
xl = excel_file.parse(shtname)
xltot = xltot.append(xl)
xltot.to_excel('compiled_xl.xlsx', 'Compiled Excels')
print("Done")
print shows:
C:\Users\eldri\OneDrive\Desktop\test
['cgnbcgnn.xlsx', 'cgncgn.xlsx', 'cgncgnc.xlsx', 'cgng.xlsx', 'fdgfg.xlsx', 'gcncgn.xlsx', 'gcngn.xlsx', 'nfvn.xlsx']
Any help would be great thanks
I am new with Python but trying to write a code which add a column on multiple .xlsx files and saves this files with the origin name to a new folder.
I have started with some coding beneath, but missing some code in open all files and saving to my DestPath. Would be pleased if any has a solution for this:
from os import listdir, path
import pandas as pd
import xlrd
SourcePath = 'C:\' #Source Path
DestPath = 'C:\' #Destination Path
# Listing up all .xlsx files from Source
def find_xlsx_filenames( path_to_dir, suffix=".xlsx" ):
filenames = listdir(path_to_dir)
return [ filename for filename in filenames if filename.endswith( suffix ) ]
filenames = find_xlsx_filenames(SourcePath)
fname = path.join(SourcePath, filenames[0]) # Tar første fil i mappa.
outname = path.join(outputdata, filenames[0])
for i in range(len(filenames)):
fname = path.join(SourcePath, filenames[i])
df = pd.read_excel(fname) #Read Excel file as a DataFrame
df['new_col'] = 'Sort Data' #Adding a new column named <Sort Data>
#To save it back as Excel
df.to_excel(DestPath, outname) #Write DateFrame back as Excel file
Thanks in Advance
check if this works
import os
import pandas as pd
path = 'C:/'
for roots, dirs, files in os.walk(path):
xlsfile = [ _ for _ in files if _.endswith('.xlsx')]
for xlsf in xlsfile:
df = pd.read_excel(os.path.join(roots, xlsf))
df['Sort Data'] = ' '
df.to_excel(os.path.join(roots, xlsf), index = False)
I have multiple directories, each of which containing any number of .xls files.
I'd like to take the files in any given directory and combine them into one .xls file, using the file names as the tab names.
For example if there are the files NAME.xls, AGE.xls, LOCATION.xls, I'd like to combine them into a new file with the data from NAME.xls on a tab called NAME, the data from AGE.xls on a tab called AGE and so on.
Each source .xls file only has one column of data with no headers.
This is what I have so far, and well it's not working.
Any help would be greatly appreciated (I'm fairly new to Python and I've never had to do anything like this before).
wkbk = xlwt.Workbook()
xlsfiles = glob.glob(os.path.join(path, "*.xls"))
onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
tabNames = []
for OF in onlyfiles:
if str(OF)[-4:] == ".xls":
sheetName = str(OF)[:-4]
tabNames.append(sheetName)
else:
pass
for TN in tabNames:
outsheet = wkbk.add_sheet(str(TN))
data = pd.read_excel(path + "\\" + TN + ".xls", sheet_name="data")
data.to_excel(path + "\\" + "Combined" + ".xls", sheet_name = str(TN))
Here is a small helper function - it supports both .xls and .xlsx files:
import pandas as pd
try:
from pathlib import Path
except ImportError: # Python 2
from pathlib2 import Path
def merge_excel_files(dir_name, out_filename='result.xlsx', **kwargs):
p = Path(dir_name)
with pd.ExcelWriter(out_filename) as xls:
_ = [pd.read_excel(f, header=None, **kwargs)
.to_excel(xls, sheet_name=f.stem, index=False, header=None)
for f in p.glob('*.xls*')]
Usage:
merge_excel_files(r'D:\temp\xls_directory', 'd:/temp/out.xls')
merge_excel_files(r'D:\temp\xlsx_directory', 'd:/temp/out.xlsx')
Can you try
import pandas as pd
import glob
path = 'YourPath\ToYour\Files\\' # Note the \\ at the end
# Create a list with only .xls files
list_xls = glob.glob1(path,"*.xls")
# Create a writer for pandas
writer = pd.ExcelWriter(path + "Combined.xls", engine = 'xlwt')
# Loop on all the files
for xls_file in list_xls:
# Read the xls file and the sheet named data
df_data = pd.read_excel(io = path + xls_file, sheet_name="data")
# Are the sheet containing data in all your xls file named "data" ?
# Write the data into a sheet named after the file
df_data.to_excel(writer, sheet_name = xls_file[:-4])
# Save and close your Combined.xls
writer.save()
writer.close()
Let me know if it works for you, I never tried engine = 'xlwt' as I don't work with .xls file but .xlsx