I am using python-watchdog PatternMatchingEventHandler to listen to any files with .xlsx extension. If any excel file is loaded to Home_Folder then it creates two folders Excel and CSV. Under Excel folder the loaded excel files are updated so that data starts with row 1. Under CSV folder, the transformed excel files are converted to csv. Below code is working fine. However, I was wondering if there is a way to simplify below code ? For example if you notice I am calling working directory again in main function. I am new to OOP not sure how to simplify below code? Any help is much appreciated!
Thanks in advance!
Python Code
import csv
from pathlib import Path
import openpyxl
from openpyxl import load_workbook,Workbook
import os
import pathlib
import glob
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
import time
import logging
from logging.handlers import RotatingFileHandler
def createFolders(HOME_FOLDER):
folders_name=['Excel','CSV']
for i in folders_name:
pathlib.Path(HOME_FOLDER+ i).mkdir(parents=True, exist_ok=True)
ALLOWED_EXTENSIONS = set(['xlsx'])
def allowed_file(filename):
return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
class FileWatcher(PatternMatchingEventHandler):
patterns = ["*.xlsx"] # Matching the file with extension .xlsx
def process(self, event):
# event.src_path will be the full file path
# event.event_type will be 'created', 'moved', etc.
print("{} noticed: {} on: {} ".format(time.asctime(),event.event_type, event.src_path))
def on_created(self, event):
self.process(event) #Calling above process
self.main(event) # Calling below main event
MyPWD = os.getcwd() #Is this redundant ?
for filename in os.listdir(MyPWD): #Is this redundant ?
path = os.path.join(MyPWD, filename) #Is this redundant ?
if os.path.isfile(path) and allowed_file(filename): #Is this redundant ?
XLFILE = filename #Is this redundant ?
def main(self,event):
def get_first_nonempty_row():
print('get_first_nonempty_row, mr=', max_row_in_sheet)
first_nonempty_row = None
for row in range(1, max_row_in_sheet + 1):
print('checking row', row)
for col in range(1, max_col_in_sheet + 1):
if sheet.cell(row, col).value is not None:
first_nonempty_row = row
print('first_nonempty_row =', first_nonempty_row)
return first_nonempty_row
return first_nonempty_row
def del_rows_before(first_nonempty_row):
modified = False
if first_nonempty_row > 1:
print('del_rows_before', first_nonempty_row)
print('deleting from 1 to', first_nonempty_row - 1)
modified = True
sheet.delete_rows(1, first_nonempty_row - 1)
return modified
#Splitting excel sheets into separate excel files
MyPWD = os.getcwd() #Is this redundant ?
workbooks = glob.iglob('*.xlsx') # Is this reduntant. Since Pattern handler is matching .xlsx files ?
for filename in os.listdir(MyPWD):
path = os.path.join(MyPWD, filename)
if os.path.isfile(path) and allowed_file(filename):
for workbook in workbooks:
print('reading:', workbook)
wb2 = openpyxl.load_workbook(workbook)
for sheet in wb2.worksheets:
new_wb = Workbook()
ws = new_wb.active
ws.title=sheet.title
for row_data in sheet.iter_rows():
for row_cell in row_data:
ws[row_cell.coordinate].value = row_cell.value
f_name=os.path.basename(workbook)
new_wb.save("/Excel/"+f_name[:-5]+"-"+sheet.title+"_Updated.xlsx")
new_wb.close()
#Splitting transformed excel Files. So empty rows are removed and data starst from row 1
Excel_file_path="/Excel"
for file in Path(Excel_file_path).glob('*_Updated.xlsx'):
wb=load_workbook(file)
wb_modified = False
for sheet in wb.worksheets:
max_row_in_sheet = sheet.max_row
max_col_in_sheet = sheet.max_column
print("this is max",max_row_in_sheet)
sheet_modified = False
if max_row_in_sheet >= 1:
first_nonempty_row = get_first_nonempty_row() # Function to find nonempty row
sheet_modified = del_rows_before(first_nonempty_row) #Function to delete nonempty roW
file_name = os.path.basename(file)
wb.save("/Excel/"+file_name[:-13]+"_Transformed.xlsx") #Converting Updated file to transformed file
wb.close()
#### Converting Files to CSV
ExcelPath ='/Excel'
CSV_FILE_PATH = 'CSV/'
for file in Path(ExcelPath).glob('*_Transformed.xlsx'): # Getting files with _Transformed.xlsx to convert to csv
wb = load_workbook(file)
print(file, wb.active.title)
for sheetname in wb.sheetnames:
with open(CSV_FILE_PATH+f'{file.stem[:-12]}.csv', 'w',encoding="utf-8-sig") as csvfile:
spamwriter = csv.writer(csvfile)
for row in wb[sheetname].rows:
spamwriter.writerow([cell.value for cell in row])
if __name__ == '__main__':
logging.basicConfig(handlers=[RotatingFileHandler('./my_log.log', maxBytes=100000, backupCount=10)],
level=logging.DEBUG,format="%(message)s")
HOME_FOLDER = 'Files to be tested/' #Folder to be watched
obs = Observer()
obs.schedule(FileWatcher(), path= HOME_FOLDER)
print("Monitoring started....")
createFolders(HOME_FOLDER)
#main()
obs.start() # Start watching
try:
while obs.isAlive():
obs.join()
finally:
obs.stop()
obs.join()
Before File Directory Structure
C:\Desktop\Jupyter Notebook\Files to be tested
After File Directory Structure
C:\Desktop\Jupyter Notebook\Files to be tested <- This Path has created folders Excel and CSV, and this is the path where excel files are getting uploaded and File listener is listening to this path ONLY.In below GIF image "Myexcel_file1","Myexcel_file2","Myexcel_file3" are uploaded.
C:\Desktop\Jupyter Notebook\Files to be tested\Excel <- This path contains transformed,updated excel files.
C:\Desktop\Jupyter Notebook\Files to be tested\CSV <- The "_transformed.xlsx" files from above Excel folder is getting converted to csv and saved into this path
GIF
Related
I have the below code, the folder to where it contains the Excel file with two columns (Original Name of the PDF file, and New Name desired column called "Matched Results.xls"; as well as all of the original name PDF files that are contained in this folder. How do I run this code so that all of my PDFs will be renamed?
def rename_file(file_to_rename, source_file):
p = Path(file_to_rename)
filename = p.stem
wb = xlrd.open_workbook(r'C:\\Users\Chris Lee\\Desktop\\File_Rename_Python\\Matched_Results.xls')
# excel file to get new filename????
sheet = wb.sheet_by_index(0)
for row_num in range(sheet.nrows):
row_value = sheet.row_values(row_num)
col = 2 # 'john smith' col number
if row_value[col] == filename:
new_filename = f'{row_value[col+1]}' # format as you want
p.rename(Path(p.parent, new_filename + p.suffix)) # rename
break
def get_paths_in_directory(directory):
return Path(directory).glob('*.pdf')
if __name__ == "__main__":
source_file = r'C:\\Users\Chris Lee\\Desktop\\File_Rename_Python\\Matched_Results.xlsx' # excel file to get new filename
source_directory = r'C:\\Users\Chris Lee\\Desktop\\File_Rename_Python' # directory where your files to rename are.
# iterate all pdf files in the given directory
paths = get_paths_in_directory(source_directory)
for file_to_rename in paths:
rename_file(str(file_to_rename), source_file)
The point of if __name__ == "__main__": is to check if the code was run directly and not imported. Since that you are not importing:
def rename_file(file_to_rename, source_file):
p = Path(file_to_rename)
filename = p.stem
wb = xlrd.open_workbook(r'C:\\Users\Chris Lee\\Desktop\\File_Rename_Python\\Matched_Results.xls')
# wb should be the below variable instead
#wb = xlrd.open_workbook(source_file)
# excel file to get new filename????
sheet = wb.sheet_by_index(0)
for row_num in range(sheet.nrows):
row_value = sheet.row_values(row_num)
col = 2 # 'john smith' col number
if row_value[col] == filename:
new_filename = f'{row_value[col+1]}' # format as you want
p.rename(Path(p.parent, new_filename + p.suffix)) # rename
print("the if was matched!", new_filename, Path(p.parent, new_filename + p.suffix))
break
def get_paths_in_directory(directory):
return Path(directory).glob('*.pdf')
if __name__ == "__main__":
source_file = r'C:\\Users\Chris Lee\\Desktop\\File_Rename_Python\\Matched_Results.xlsx' # excel file to get new filename
source_directory = r'C:\\Users\Chris Lee\\Desktop\\File_Rename_Python' # directory where your files to rename are.
# iterate all pdf files in the given directory
paths = get_paths_in_directory(source_directory)
for file_to_rename in paths:
rename_file(str(file_to_rename), source_file)
You can replace everything after and including if __name__ == "__main__": with:
def my_func():
source_file = r'C:\\Users\Chris Lee\\Desktop\\File_Rename_Python\\Matched_Results.xlsx' # excel file to get new filename
source_directory = r'C:\\Users\Chris Lee\\Desktop\\File_Rename_Python' # directory where your files to rename are.
# iterate all pdf files in the given directory
paths = get_paths_in_directory(source_directory)
for file_to_rename in paths:
rename_file(str(file_to_rename), source_file)
and call the function with:
my_func()
so your whole jupyter field would look like:
def rename_file(file_to_rename, source_file):
p = Path(file_to_rename)
filename = p.stem
wb = xlrd.open_workbook(r'C:\\Users\Chris Lee\\Desktop\\File_Rename_Python\\Matched_Results.xls')
# excel file to get new filename????
sheet = wb.sheet_by_index(0)
for row_num in range(sheet.nrows):
row_value = sheet.row_values(row_num)
col = 3 # 'john smith' col number
if row_value[col] == filename:
new_filename = f'{row_value[col+1]}' # format as you want
p.rename(Path(p.parent, new_filename + p.suffix)) # rename
print("the if was matched!", new_filename, Path(p.parent, new_filename + p.suffix))
break
def get_paths_in_directory(directory):
return Path(directory).glob('*.pdf')
def my_func():
source_file = r'C:\\Users\Chris Lee\\Desktop\\File_Rename_Python\\Matched_Results.xlsx' # excel file to get new filename
source_directory = r'C:\\Users\Chris Lee\\Desktop\\File_Rename_Python' # directory where your files to rename are.
# iterate all pdf files in the given directory
paths = get_paths_in_directory(source_directory)
for file_to_rename in paths:
rename_file(str(file_to_rename), source_file)
my_func()
provided you have all the needed imports.
EDIT: A function I put together with some threads on SO - you will need to figure out how you want to rename your file, create the algorithm, then implement it in the if statement.
import os
def rename_func(directory):
d = os.fsencode(directory)
for file in os.listdir(d):
filename = os.fsdecode(file)
if filename.endswith(".pdf"):
# this is what your file will be renamed to
os.rename(os.path.join(d, filename), 'renamed_file.pdf')
rename_func(r"C:\my_dir")
I try to convert a .xlsx file into .csv file whenever a new file is added into the Inputfolder and put the conversion .csv file in the OutputFolder.
import glob
import time
import os
import pandas as pd
#Get timestamp
timestr = time.strftime("%Y%m%d_%H%M%S")
#Input file path
input_filepath = 'C:/Documents/InputFile'
folderSize = 0
#Function to convert file
def format_csv(latest_file):
#Output file path
filenamepath = 'C:/Documents/OutputFile/' + timestr + '.csv'
read_Excelfile = pd.read_excel(latest_file)
read_Excelfile.to_csv(filenamepath, index=None, header=True)
while True:
checkFolder = folderSize
folderSize = 0
#Check the size of the Input Folder
for path, dirs, files in os.walk(input_filepath):
for f in files:
fp = os.path.join(path, f)
folderSize += os.path.getsize(fp)
print(folderSize)
#Create new .csv file if the Input folder has new file added
if(folderSize > checkFolder):
list_of_files = glob.glob('C:/Documents/InputFile/*.xlsx')
latest_file = max(list_of_files, key=os.path.getctime)
format_csv(latest_file)
print(latest_file)
time.sleep(15)
Right now the program will only convert the first .xlsx file only. If I add a new .xlsx file into InputFolder, the file is not converted.
You could try something like reading through the folder for all .xlsx files if it finds one convert that to .csv
Here we are reading through the directory for all xlsx files, converting them by creating copies in csv version and then deleting the original xlsx version
import pandas as pd
import os
path = 'C:/Documents/InputFile'
files = os.listdir(path)
for file in files:
if '.xlsx' in file:
filename = file[:-5]
new_filename = path + "/" + filename + ".csv"
if filename + ".csv" in files:
pass
else:
df = pd.read_excel(file)
df.to_csv(new_filename)
I already improvise my original code. So, whenever I put a new excel file into InputFolder, the program will convert the file to .csv format and insert the formatted file in OutputFolder
import glob
import time
import os
import pandas as pd
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
#Function if new file is created in the folder
def on_created(event):
list_of_files = glob.glob('C:/Users/Documents/InputFolder/*.xlsx')
latest_file = max(list_of_files, key=os.path.getctime)
format_csv(latest_file)
#Function to convert .xlsx to .csv
def format_csv(latest_file):
# Get timestamp
timestr = time.strftime("%d%m%Y_%H%M%S")
#Output file path
filenamepath = 'C:/Users/Documents/OutputFolder/' + timestr + '.csv'
read_Excelfile = pd.read_excel(latest_file)
read_Excelfile.to_csv(filenamepath, index=None, header=True)
print(filenamepath)
if __name__ == "__main__":
event_handler = FileSystemEventHandler()
#Calling function for file insertion
event_handler.on_created = on_created
#Input Folder
path = 'C:/Users/Documents/InputFolder'
#Function to observe file
observer = Observer()
observer.schedule(event_handler, path, recursive=True)
observer.start()
try:
#Check every one second
while True:
time.sleep(1)
except KeyboardInterrupt:
#Program stop if keyboard interupt
observer.stop()
observer.join()
hi guys currently moving file based on filename on my csv file but it always move the files first and then read the filename so it always got error already exist like this
Error: Destination path 'Sortir/Membuka kertas contekan/aug1_Data16_133_86.jpg' already exists
CODE
import pandas as pd
data = pd.read_csv('train.csv')
filenames = data['filename'].values
filenames = filenames.tolist()
classes = data['class'].values
classes = classes.tolist()
print(filenames)
print(classes)
import shutil
import os
for index, row in data.iterrows():
print(row['filename'], os.path.join("Sortir",row['class']))
if not os.path.exists(os.path.join("Sortir",row['class'])):
print("[INFO] 'creating {}' directory".format(os.path.join("Sortir",row['class'])))
os.mkdir(os.path.join("Sortir",row['class']))
shutil.move(os.path.join("images",row["filename"]), os.path.join("Sortir",row['class']))
Anyone know how to do the read the row first and then move the file? or maybe keep continue to read other row even if the file that I want to move being already moved?
Found the Answer Code here :
import shutil
import os
import pandas as pd
data = pd.read_csv('test.csv')
filenames = data['filename'].values
filenames = filenames.tolist()
classes = data['class'].values
classes = classes.tolist()
print(filenames)
print(classes)
for index, row in data.iterrows():
if not os.path.exists(os.path.join("SortirTest",row['class'])):
print("[INFO] 'creating {}' directory".format(os.path.join("SortirTest",row['class'])))
os.mkdir(os.path.join("SortirTest",row['class']))
input_name = os.path.join("images", row["filename"])
output_name = os.path.join("SortirTest", row['class'], row['filename'])
if os.path.exists(input_name):
dest = shutil.move(input_name, output_name)
print("This File Has Been Moved:", input_name)
else:
print("This File Doesnt Exist :", input_name)
continue
In shutil.move() function you have to add the filename in the new directory too:
input_name = os.path.join("images", row["filename"])
output_name = os.path.join("Sortir", row['class'], row['filename'])
shutil.move(input_name, output_name)
Have you tried to clear the 'Sortir' folder before running the script?
I am new with Python but trying to write a code which add a column on multiple .xlsx files and saves this files with the origin name to a new folder.
I have started with some coding beneath, but missing some code in open all files and saving to my DestPath. Would be pleased if any has a solution for this:
from os import listdir, path
import pandas as pd
import xlrd
SourcePath = 'C:\' #Source Path
DestPath = 'C:\' #Destination Path
# Listing up all .xlsx files from Source
def find_xlsx_filenames( path_to_dir, suffix=".xlsx" ):
filenames = listdir(path_to_dir)
return [ filename for filename in filenames if filename.endswith( suffix ) ]
filenames = find_xlsx_filenames(SourcePath)
fname = path.join(SourcePath, filenames[0]) # Tar første fil i mappa.
outname = path.join(outputdata, filenames[0])
for i in range(len(filenames)):
fname = path.join(SourcePath, filenames[i])
df = pd.read_excel(fname) #Read Excel file as a DataFrame
df['new_col'] = 'Sort Data' #Adding a new column named <Sort Data>
#To save it back as Excel
df.to_excel(DestPath, outname) #Write DateFrame back as Excel file
Thanks in Advance
check if this works
import os
import pandas as pd
path = 'C:/'
for roots, dirs, files in os.walk(path):
xlsfile = [ _ for _ in files if _.endswith('.xlsx')]
for xlsf in xlsfile:
df = pd.read_excel(os.path.join(roots, xlsf))
df['Sort Data'] = ' '
df.to_excel(os.path.join(roots, xlsf), index = False)
I have multiple directories, each of which containing any number of .xls files.
I'd like to take the files in any given directory and combine them into one .xls file, using the file names as the tab names.
For example if there are the files NAME.xls, AGE.xls, LOCATION.xls, I'd like to combine them into a new file with the data from NAME.xls on a tab called NAME, the data from AGE.xls on a tab called AGE and so on.
Each source .xls file only has one column of data with no headers.
This is what I have so far, and well it's not working.
Any help would be greatly appreciated (I'm fairly new to Python and I've never had to do anything like this before).
wkbk = xlwt.Workbook()
xlsfiles = glob.glob(os.path.join(path, "*.xls"))
onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
tabNames = []
for OF in onlyfiles:
if str(OF)[-4:] == ".xls":
sheetName = str(OF)[:-4]
tabNames.append(sheetName)
else:
pass
for TN in tabNames:
outsheet = wkbk.add_sheet(str(TN))
data = pd.read_excel(path + "\\" + TN + ".xls", sheet_name="data")
data.to_excel(path + "\\" + "Combined" + ".xls", sheet_name = str(TN))
Here is a small helper function - it supports both .xls and .xlsx files:
import pandas as pd
try:
from pathlib import Path
except ImportError: # Python 2
from pathlib2 import Path
def merge_excel_files(dir_name, out_filename='result.xlsx', **kwargs):
p = Path(dir_name)
with pd.ExcelWriter(out_filename) as xls:
_ = [pd.read_excel(f, header=None, **kwargs)
.to_excel(xls, sheet_name=f.stem, index=False, header=None)
for f in p.glob('*.xls*')]
Usage:
merge_excel_files(r'D:\temp\xls_directory', 'd:/temp/out.xls')
merge_excel_files(r'D:\temp\xlsx_directory', 'd:/temp/out.xlsx')
Can you try
import pandas as pd
import glob
path = 'YourPath\ToYour\Files\\' # Note the \\ at the end
# Create a list with only .xls files
list_xls = glob.glob1(path,"*.xls")
# Create a writer for pandas
writer = pd.ExcelWriter(path + "Combined.xls", engine = 'xlwt')
# Loop on all the files
for xls_file in list_xls:
# Read the xls file and the sheet named data
df_data = pd.read_excel(io = path + xls_file, sheet_name="data")
# Are the sheet containing data in all your xls file named "data" ?
# Write the data into a sheet named after the file
df_data.to_excel(writer, sheet_name = xls_file[:-4])
# Save and close your Combined.xls
writer.save()
writer.close()
Let me know if it works for you, I never tried engine = 'xlwt' as I don't work with .xls file but .xlsx