Update a specific sheet name in all the excels inside a folder - python

In a folder, I have 50 Excel files with multiple sheets in each file. I have to update the name of a sheet in these files wherever the sheet name contains "XYZ".
So for each file, if the sheet_name has "XYZ", change that sheet_name to "ABC". I tried looping through the files using the following code but could not write code to change sheet names :
# Collect the full path of every Excel file found anywhere below `directory`.
filelist = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(directory)
    for name in names
    if name.endswith(('.xlsx', '.xls', '.XLS'))
]

You can simplify the file listing like this:
import os
mypath = r'C:\your\files\path'
# NOTE: this renames the workbook *files* on disk; it does not change the
# names of the worksheets stored inside each workbook.
excel_suffixes = ('.xls', '.xlsx', '.XLS')
for filename in os.listdir(mypath):
    if not filename.endswith(excel_suffixes):
        continue
    new_name = filename.replace('XYZ', 'ABC')
    if new_name != filename:  # nothing to do when the name has no "XYZ"
        os.rename(os.path.join(mypath, filename), os.path.join(mypath, new_name))

You can use openpyxl and glob
import glob
import os

from openpyxl import load_workbook

# Folder that holds the workbooks; adjust to your own location.
directory = r"C:\your\files\path"

# openpyxl reads the XML-based .xlsx format only, so legacy binary .xls files
# are deliberately not matched by this pattern.
for path in glob.glob(os.path.join(directory, "*.xlsx")):
    wb = load_workbook(path)
    renamed = False
    for ws in wb.worksheets:
        if "XYZ" in ws.title:
            ws.title = "ABC"
            renamed = True
    if renamed:  # save once per workbook, and only when something changed
        wb.save(path)

Related

Invalid extension for engine problem, iterating through directories and files

I have a code, which is working properly if I manually insert strings for path, directory and file name, here is the code:
# Hand-written version for a single participant folder ("ab3b").
directory = "test"
path = r"test//ab3b//ab3b_all_anal.xlsx"

# Each CSV flavour is analysed by its own routine.
file1 = "test//ab3b//ab3b80.csv"
file2 = "test//ab3b//ab3b80m.csv"
df1 = all_calc_80(file1, directory)
df2 = all_calc_80m(file2, directory)

# One workbook with one sheet per CSV; leaving the block closes the writer.
with pd.ExcelWriter(path, engine = 'xlsxwriter') as writer:
    df1.to_excel(writer, sheet_name = '80')
    df2.to_excel(writer, sheet_name = '80m')
Test directory has subdirectories named as ab3b, bg3a, ge3b etc. and in each of subdirectories there are files named in same way: ab3b80.csv, ab3b80m.csv; bg3a80.csv, bg3a80m.csv; ge3b80.csv, ge3b80m.csv.
Each of files based on ending 80.csv or 80m.csv use different function for analysing.
The final output is one excel workbook with sheets names after ending of csv files.
Now I am working on iterating through whole directory test, where I just give the name of the directory and everything is proceed automatically from there. So far I have:
import os
import xlsxwriter
rootdir = 'test'

# Map every directory below rootdir to the file names it contains.
slovar = {subdir: files for subdir, dirs, files in os.walk(rootdir)}

for key, value in slovar.items():
    if len(key) <= 4:  # to get just subdirectories I need
        continue
    end = key[-4:]
    # NOTE(review): this concatenates the *characters* r, ' and ' into the
    # path itself, so the string ends in "xlsx'" rather than "xlsx" -- which
    # matches the extension shown in the error message quoted below.
    path = 'r' + '\'' + key + '\\\\' + end + '_all_anal.xlsx' + '\''
    print(path)
    for vrednost in value:
        if vrednost.endswith('80.csv'):
            file1 = vrednost
            df1 = all_calc_80(file1, rootdir)
        elif vrednost.endswith('80m.csv'):
            file2 = vrednost
            df2 = all_calc_80m(file2, rootdir)
    with pd.ExcelWriter(path, engine = 'xlsxwriter') as writer:
        df1.to_excel(writer, sheet_name = '80')
        df2.to_excel(writer, sheet_name = '80m')
But I got error message: Invalid extension for engine '<property object at 0x000002123659D0E0>': 'xlsx''.
I think there might be some problem with / and \ in Windows paths, or with the types of the objects, even though I get useful output when I print just the keys and values, and the path is also printed properly.
But I don't really understand why manually everything works and automated not.
If someone will still search for this answer, I had found a solution.
Main discovery was regarding how to append path and file name to the list.
It is done with os.path.join(dirpath, filename), if you use os.walk.
Here is the working code:
seznam80 = []
seznam80m = []
# Not used below; presumably filled the same way for the other file endings.
seznam120 = []
seznam120m = []
seznam150 = []
seznam150m = []
seznamSMT = []
dirp = []

# A single pass over the tree collects both kinds of CSV file.
for dirpath, dirnames, filenames in os.walk(directory):  # directory with all folders of participants
    for filename in filenames:
        if filename.endswith("80.csv"):
            seznam80.append(os.path.join(dirpath, filename))
            dirp.append(dirpath)  # remembered to name the output workbook
        elif filename.endswith("80m.csv"):
            seznam80m.append(os.path.join(dirpath, filename))

# The three lists are filled in the same walk order, so zip() pairs the 80
# and 80m file of each participant folder with that folder's path.
for vsak80, vsak80m, pot in zip(seznam80, seznam80m, dirp):
    path = pot + '_all_anal.xlsx'
    df1 = all_calc_80(vsak80, directory)
    df2 = all_calc_80m(vsak80m, directory)
    # The context manager closes the workbook even if writing a sheet fails.
    with pd.ExcelWriter(path, engine = 'xlsxwriter') as writer:
        df1.to_excel(writer, sheet_name = '80')
        df2.to_excel(writer, sheet_name = '80m')

Renaming files based on Dataframe content with Python and Pandas

I am trying to read a xlsx file, compare all the reference numbers from a column to files inside a folder and if they correspond, rename them to an email associate with the reference number.
Excel File has fields such as:
Reference EmailAddress
1123 bob.smith#yahoo.com
1233 john.drako#gmail.com
1334 samuel.manuel#yahoo.com
... .....
My folder applicants just contains doc files named as the Reference column:
How can I compare the contents of the applicantsCVs folder, to the Reference field inside my excel file and if it matches, rename all of the files as the corresponding email address ?
Here is What I've tried so far:
import os
import pandas as pd
# Load the sheet and pull out the two columns of interest.
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")
references = dfOne['Reference']
emailAddress = dfOne['EmailAddress']
cleanedEmailList = [x for x in emailAddress if str(x) != 'nan']
print(cleanedEmailList)

# File names in the CV folder, without their extension.
filesArray = []
for root, dirs, files in os.walk("applicantCVs"):
    for filename in files:
        print(filename) #Original file name with type 1233.doc
        stem, _extension = os.path.splitext(filename)
        filesArray.append(stem)

# Reference numbers from the sheet, as text.
excelArray = [str(entry) for entry in references]

for i in excelArray:
    if i not in filesArray:
        continue
    print(i, "corresponds to the file names")
I compare the reference names to the folder contents and print it out if it's the same:
# Report every reference number that also exists as a file name.
for i in excelArray:
    if i not in filesArray:
        continue
    print(i, "corresponds to the file names")
I've tried to rename it with os.rename(filename, cleanedEmailList ) but it didn't work because cleanedEmailList is an array of emails.
How can I match and rename the files?
Update:
from os.path import dirname
import pandas as pd
from pathlib import Path
import os
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")


def _reference_key(value):
    """Return a reference as the text used in the file names (1123.0 -> '1123')."""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Drop incomplete rows *before* converting to text; converting first would
# turn a missing value into the string 'nan', which dropna() no longer removes.
complete = dfOne.dropna(subset=["Reference", "EmailAddress"])
references = {
    _reference_key(ref): email
    for ref, email in zip(complete["Reference"], complete["EmailAddress"])
}
print(references)

for file in Path("applicantCVs").glob("*"):
    new_name = references.get(file.stem)
    if new_name is None:  # no matching reference: leave the file untouched
        continue
    file.rename(file.with_name(f"{new_name}{file.suffix}"))
based on sample data:
Reference EmailAddress
1123 bob.smith#yahoo.com
1233 john.drako#gmail.com
nan jane.smith#example.com
1334 samuel.manuel#yahoo.com
First you assemble a dict with the set of references as keys and the new names as values:
references = dict(df.dropna(subset=["Reference","EmailAddress"]).set_index("Reference")["EmailAddress"])
{'1123': 'bob.smith#yahoo.com',
'1233': 'john.drako#gmail.com',
'1334': 'samuel.manuel#yahoo.com'}
Note that the references are strs here. If they aren't in your original database, you can use astype(str)
Then you use pathlib.Path to look for all the files in the data directory:
files = Path("../data/renames").glob("*")
[WindowsPath('../data/renames/1123.docx'),
WindowsPath('../data/renames/1156.pptx'),
WindowsPath('../data/renames/1233.txt')]
The renaming can be made very simple:
# Rename each file to its mapped name; unmapped files keep their own stem.
for file in files:
    stem = references.get(file.stem, file.stem )
    target = file.with_name(f"{stem}{file.suffix}")
    file.rename(target)
The references.get asks for the new filename, and if it doesn't find it, use the original stem.
[WindowsPath('../data/renames/1156.pptx'),
WindowsPath('../data/renames/bob.smith#yahoo.com.docx'),
WindowsPath('../data/renames/john.drako#gmail.com.txt')]
How about adding the "email associate" (your new name i guess?) into an dictionary, where the keys are your reference numbers?
This could look something like:
# Reference -> e-mail lookup. Keys are text so they compare equal to the
# str() entries held in excelArray.
email_by_ref = dict(zip(dfOne['Reference'].astype(str), dfOne['EmailAddress']))

cor_dict = {}
for i in excelArray:
    # Keep only references that exist as a file and have a usable address.
    if i in filesArray and i in email_by_ref and pd.notna(email_by_ref[i]):
        cor_dict[i] = email_by_ref[i]

path = 'path to file...'
for entry in cor_dict.items():
    filename = str(entry[0])+'.doc'
    new_filename = str(entry[1]).replace('#','_') + '_.doc'
    filepath = os.path.join(path, filename)
    new_filepath = os.path.join(path,new_filename)
    # Rename using the full paths, not the bare file names.
    os.rename(filepath, new_filepath)
This is one approach using a simple iteration.
Ex:
import os

import pandas as pd

#Sample Data#
#dfOne = pd.DataFrame({'Reference': [1123, 1233, 1334, 4444, 5555],'EmailAddress': ["bob.smith#yahoo.com", "john.drako#gmail.com", "samuel.manuel#yahoo.com", np.nan, "samuel.manuel#yahoo.com"]})
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")
dfOne.dropna(inplace=True) #Drop rows with NaN


def _reference_key(value):
    """Return a reference as the text used in the file names (1123.0 -> '1123')."""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Exact-match lookup: a substring test would let a file named "12" match
# reference 1123, and would treat the file name as a regular expression.
email_by_ref = {
    _reference_key(ref): email
    for ref, email in zip(dfOne['Reference'], dfOne['EmailAddress'])
}

for root, dirs, files in os.walk("applicantsCVs"):
    for file in files:
        file_name, ext = os.path.splitext(file)
        email = email_by_ref.get(file_name)
        if email is not None:
            os.rename(os.path.join(root, file), os.path.join(root, email+ext))
Or if you have only .docx file to rename
import os
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")
# Drop incomplete rows first; converting to text first would turn a missing
# reference into the string 'nan' and keep the row.
dfOne = dfOne.dropna(subset=["Reference", "EmailAddress"])


def _reference_key(value):
    """Return a reference as the text used in the file names (1123.0 -> '1123')."""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


dfOne["Reference"] = dfOne["Reference"].map(_reference_key)
ext = ".docx"
for root, dirs, files in os.walk("applicantsCVs"):
    # Stems of the .docx files in this folder, for exact matching.
    stems = {os.path.splitext(name)[0] for name in files if name.endswith(ext)}
    matched = dfOne[dfOne["Reference"].isin(stems)]
    # Select the two columns by name so extra columns cannot break unpacking.
    for ref, email in zip(matched["Reference"], matched["EmailAddress"]):
        os.rename(os.path.join(root, ref+ext), os.path.join(root, email+ext))
You could do it directly in your dataframe using df.apply():
import glob
import os.path
#Filter out null addresses (copy, so adding a column does not touch df)
df2 = df.dropna(subset=['EmailAddress']).copy()
#Add a column listing the files named after the reference (any extension)
df2['Existing_file'] = [glob.glob("applicantsCVs/{}.*".format(ref)) for ref in df2['Reference']]

renamed = 0
for row in df2.itertuples():
    if not row.Existing_file:
        continue
    old_path = row.Existing_file[0]
    extension = os.path.splitext(old_path)[1]  # keeps the leading dot
    os.rename(old_path, 'applicantsCVs/{}{}'.format(row.EmailAddress, extension))
    renamed += 1
print(renamed, "existing files renamed")
EDIT :
works now with any extension (.doc, .docx) by using glob module
Let consider our sample data in excel sheet is following:
Reference EmailAddress
1123 bob.smith#yahoo.com
1233 john.drako#gmail.com
1334 samuel.manuel#yahoo.com
nan python#gmail.com
There are following steps involved to solve this problem.
Step 1
import the data properly from excel sheet "my.xlsx". Here I am using the sample data
import pandas as pd
import os
#import data from excel sheet and drop rows with nan
df = pd.read_excel('my.xlsx').dropna()
#check the head of data if the data is in desirable format
df.head()
You will see that the values in the Reference column are of float type here
Step 2
Change the data type in the reference column to integer and then into string
# astype() returns a new object and takes no `inplace` argument.
df['Reference'] = df['Reference'].astype(int)  # 1123.0 -> 1123
df = df.astype(str)  # every column as text, e.g. '1123'
df.head()
Now the data is in desirable format
Step 3
Renaming the files in the desired folder. Zip the lists of 'Reference' and 'EmailAddress' to use in for loop.
#absolute path to folder. I consider you have the folder "application cv" in the home directory
path_to_files='/home/applicant cv/'
for ref, email in zip(df['Reference'], df['EmailAddress']):
    try:
        os.rename(os.path.join(path_to_files, ref + '.doc'),
                  os.path.join(path_to_files, email + '.doc'))
    except FileNotFoundError:
        # Only a missing source file is expected; any other error should surface.
        print ("File name doesn't exist in the list, I am leaving it as it is")
Step 1: import the data from excel sheet "Book1.xlsx"
import pandas as pd
df = pd.read_excel (r'path of your file here\Book1.xlsx')
print (df)
Step 2: Choose path that your ".docx" files are in and store their names.
Get only the relevant part of the filename to compare.
mypath = r'path of docx files\doc files'
from os import listdir,rename
from os.path import isfile, join
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
#print(onlyfiles)
currentfilename=onlyfiles[0].split(".")[0]
This is how I stored the files
Step 3: Run loop to check if name matches with the Reference. And just use rename(src,dest) function from os
# Map each reference (as text) to its e-mail address once, up front.
email_by_reference = {str(ref): email for ref, email in zip(df['Reference'], df['EmailAddress'])}

# Check every file in the folder, not just the first one, and every row of
# the sheet, not just the first three.
for docname in onlyfiles:
    stem, _, extension = docname.rpartition(".")
    if extension != "docx" or stem not in email_by_reference:
        continue
    corrosponding_email = email_by_reference[stem]
    rename(join(mypath, docname), join(mypath, corrosponding_email + ".docx"))
checkout the code with example:https://github.com/Vineet-Dhaimodker

Python: Collect all xlsx. - Open - Add Column - Save in new folder

I am new to Python and am trying to write code that adds a column to multiple .xlsx files and saves each file under its original name in a new folder.
I have started with some coding beneath, but missing some code in open all files and saving to my DestPath. Would be pleased if any has a solution for this:
from os import listdir, path
import pandas as pd
import xlrd
# A backslash before the closing quote escapes it, so it must be doubled.
SourcePath = 'C:\\' #Source Path
DestPath = 'C:\\' #Destination Path


# Listing up all .xlsx files from Source
def find_xlsx_filenames( path_to_dir, suffix=".xlsx" ):
    """Return the names of the files in *path_to_dir* that end with *suffix*."""
    return [ filename for filename in listdir(path_to_dir) if filename.endswith( suffix ) ]


for filename in find_xlsx_filenames(SourcePath):
    df = pd.read_excel(path.join(SourcePath, filename)) #Read Excel file as a DataFrame
    df['new_col'] = 'Sort Data' #Adding a new column 'new_col' filled with 'Sort Data'
    # Save under the original name in the destination folder; index=False
    # keeps the DataFrame's row index from becoming an extra column.
    df.to_excel(path.join(DestPath, filename), index=False)
Thanks in Advance
check if this works
import os
import pandas as pd
path = 'C:/'
# Visit every folder below `path` and rewrite each .xlsx file in place with an
# extra 'Sort Data' column.
for roots, dirs, files in os.walk(path):
    for xlsf in [name for name in files if name.endswith('.xlsx')]:
        workbook_path = os.path.join(roots, xlsf)
        df = pd.read_excel(workbook_path)
        df['Sort Data'] = ' '
        df.to_excel(workbook_path, index = False)

Adding Multiple .xls files to a Single .xls file, using the file name to name tabs

I have multiple directories, each of which containing any number of .xls files.
I'd like to take the files in any given directory and combine them into one .xls file, using the file names as the tab names.
For example if there are the files NAME.xls, AGE.xls, LOCATION.xls, I'd like to combine them into a new file with the data from NAME.xls on a tab called NAME, the data from AGE.xls on a tab called AGE and so on.
Each source .xls file only has one column of data with no headers.
This is what I have so far, and well it's not working.
Any help would be greatly appreciated (I'm fairly new to Python and I've never had to do anything like this before).
wkbk = xlwt.Workbook()
xlsfiles = glob.glob(os.path.join(path, "*.xls"))
onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]

# Tab names are the .xls file names with the extension cut off.
tabNames = [str(OF)[:-4] for OF in onlyfiles if str(OF)[-4:] == ".xls"]

for TN in tabNames:
    outsheet = wkbk.add_sheet(str(TN))
    data = pd.read_excel(path + "\\" + TN + ".xls", sheet_name="data")
    # NOTE(review): every pass writes to the same Combined.xls path, and the
    # xlwt workbook built above is never saved.
    data.to_excel(path + "\\" + "Combined" + ".xls", sheet_name = str(TN))
Here is a small helper function - it supports both .xls and .xlsx files:
import pandas as pd
try:
from pathlib import Path
except ImportError: # Python 2
from pathlib2 import Path
def merge_excel_files(dir_name, out_filename='result.xlsx', **kwargs):
    """Combine every Excel file in *dir_name* into one workbook.

    Each source file (``*.xls*``) is read without a header row and written to
    its own sheet, named after the file's stem, with no index or header.

    Args:
        dir_name: Folder that holds the source workbooks.
        out_filename: Path of the workbook to create.
        **kwargs: Extra keyword arguments passed on to ``pd.read_excel``.
    """
    source_dir = Path(dir_name)
    target = Path(out_filename).resolve()
    with pd.ExcelWriter(out_filename) as xls:
        # Sorted so the sheet order does not depend on directory listing order.
        for f in sorted(source_dir.glob('*.xls*')):
            # Skip Excel's "~$" lock files and the output workbook itself when
            # it lives in the source folder.
            if f.name.startswith('~$') or f.resolve() == target:
                continue
            frame = pd.read_excel(f, header=None, **kwargs)
            frame.to_excel(xls, sheet_name=f.stem, index=False, header=False)
Usage:
merge_excel_files(r'D:\temp\xls_directory', 'd:/temp/out.xls')
merge_excel_files(r'D:\temp\xlsx_directory', 'd:/temp/out.xlsx')
Can you try
import glob
import os

import pandas as pd

path = r'YourPath\ToYour\Files'
combined_name = "Combined.xls"

# Create a list with only .xls files, leaving out the result of an earlier run
list_xls = [
    xls_file
    for xls_file in glob.glob(os.path.join(path, "*.xls"))
    if os.path.basename(xls_file) != combined_name
]

# Create a writer for pandas; the with-block saves and closes Combined.xls
# NOTE(review): engine 'xlwt' is only available in older pandas releases.
with pd.ExcelWriter(os.path.join(path, combined_name), engine = 'xlwt') as writer:
    # Loop on all the files
    for xls_file in list_xls:
        # Read the xls file and the sheet named data
        # Are the sheet containing data in all your xls file named "data" ?
        df_data = pd.read_excel(io = xls_file, sheet_name="data")
        # Write the data into a sheet named after the file
        sheet_name = os.path.splitext(os.path.basename(xls_file))[0]
        df_data.to_excel(writer, sheet_name = sheet_name)
Let me know if it works for you, I never tried engine = 'xlwt' as I don't work with .xls file but .xlsx

xlrd to loop through multiple workbooks in a folder

I'm working on a script that pulls certain rows of data from multiple Excel workbooks in a folder (the critical sheet has the same name in every workbook). This code seems only to process/print results from the first file in the folder:
import os
import xlrd
# Question listing: walks the folder, opens each .xlsx file with xlrd and
# prints the rows of 'Sheet1' whose first cell equals 'bike'.
# NOTE(review): the indentation of this listing is not preserved here, so the
# loop nesting that the question is about cannot be seen.
for root, dirs, files in os.walk('/Users/123/Desktop/drivingtests'):
xlsfiles=[ _ for _ in files if _.endswith('.xlsx') ]
for xlsfile in xlsfiles:
workbook = xlrd.open_workbook(os.path.join(root,xlsfile))
worksheet = workbook.sheet_by_name('Sheet1')
for row in range(worksheet.nrows):
# The workbook and sheet are opened again here, repeating the two lines above.
workbook = xlrd.open_workbook(os.path.join(root,xlsfile))
worksheet = workbook.sheet_by_name('Sheet1')
if worksheet.row_values(row)[0] == 'bike':
print worksheet.row_values(row)
What should be done to have the script process every workbook in the folder?
The answer is "indentation is important". When indented like the code below, it loops through all the files in the folder.
import os
import xlrd
# Every loop is nested inside the previous one, so each workbook found in
# each folder is opened and scanned in turn.
for root, dirs, files in os.walk('/Users/123/Desktop/drivingtests'):
    xlsfiles = [name for name in files if name.endswith('.xlsx')]
    for xlsfile in xlsfiles:
        workbook = xlrd.open_workbook(os.path.join(root, xlsfile))
        worksheet = workbook.sheet_by_name('Sheet1')
        for row in range(worksheet.nrows):
            values = worksheet.row_values(row)  # read the row once
            if values and values[0] == 'bike':
                # Call form of print works on both Python 2 and Python 3.
                print(values)

Categories

Resources