I have 24 Excel files, and I'm aiming to copy the .xlsx data into their respective 24 .csv files. I have copied the data over, but it's creating 10 copies in each .csv file; I believe it has something to do with the for loops. I've tried using writerow() rather than writerows(), yet that doesn't help. I'm trying to understand openpyxl and its writer and reader objects.
import openpyxl, os, csv
from pathlib import Path

for excelFile in os.listdir('./excelspreadsheets'):
    if excelFile.endswith('.xlsx'):  # Skip non-xlsx files, load the workbook object
        wb = openpyxl.load_workbook('./excelspreadsheets/' + excelFile)
        for sheetName in wb.sheetnames:
            # Loop through every sheet in the workbook
            sheet = wb[sheetName]
            sheetTitle = sheet.title
            # Create the CSV filename from the Excel filename and sheet title
            p = Path(excelFile)
            excelFileStemName = p.stem
            CsvFilename = excelFileStemName + '_' + sheetTitle + '.csv'
            # Create the csv.writer object for this CSV file
            print(f'Creating filename {CsvFilename}...')
            outputFile = open(CsvFilename, 'w', newline='')
            outputWriter = csv.writer(outputFile)
            # Create reader object for each excel sheet
            fileObj = open('./excelspreadsheets/' + excelFile)
            fileReaderObj = csv.reader(fileObj)
            # Loop through every row in the excel sheet
            for rowNum in range(1, sheet.max_row + 1):
                rowData = []  # append each cell to this list
                # Loop through each cell in the row
                for colNum in range(1, sheet.max_column + 1):
                    rowData.append(sheet.values)
                # write the rowData list to the CSV file.
                for row in rowData:
                    outputWriter.writerows(row)
            outputFile.close()
So each of the newly created .csv files contains the correct data, but written 10 times rather than once.
Appreciate any feedback, thanks.
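For what it's worth, the duplication most likely comes from rowData.append(sheet.values): sheet.values is a generator over every row of the sheet, so a full copy of the sheet is queued up once per column and then written out with writerows(). A minimal corrected sketch that keeps the pure openpyxl + csv approach (no csv.reader is needed, since openpyxl already reads the cells):

import csv
import openpyxl
from pathlib import Path

for excelFile in Path('./excelspreadsheets').glob('*.xlsx'):
    wb = openpyxl.load_workbook(excelFile)
    for sheet in wb:
        CsvFilename = f'{excelFile.stem}_{sheet.title}.csv'
        print(f'Creating filename {CsvFilename}...')
        with open(CsvFilename, 'w', newline='') as outputFile:
            outputWriter = csv.writer(outputFile)
            # iter_rows(values_only=True) yields one tuple of cell values per row
            for row in sheet.iter_rows(values_only=True):
                outputWriter.writerow(row)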
You can use read_excel and to_csv, which come as part of pandas, to read the Excel file and write the data to a CSV file. It is simpler from a coding perspective, as the read and the write are each done in one line, and pandas uses openpyxl underneath. The updated code is below.
import os
from pathlib import Path
import pandas as pd

for excelFile in os.listdir('./excelspreadsheets'):
    if excelFile.endswith('.xlsx'):  # Skip non-xlsx files
        xls = pd.ExcelFile('./excelspreadsheets/' + excelFile)
        for sheetname in xls.sheet_names:
            # Read each sheet into a DataFrame
            df = pd.read_excel('./excelspreadsheets/' + excelFile, sheet_name=sheetname)
            # Remove .xlsx from the filename and create the CSV name
            # (Path.stem drops the suffix; rstrip('.xlsx') would strip characters, not the suffix)
            CsvFilename = Path(excelFile).stem + '_' + sheetname + '.csv'
            print(f'Creating filename {CsvFilename}...')
            # Write the DataFrame as CSV to file
            df.to_csv(CsvFilename, index=False)
Let me know if you see any errors...
I am trying to merge multiple .xls files that have many columns, one of which contains hyperlinks. I am trying to do this with Python but keep running into errors I cannot solve.
To be precise, the hyperlinks are hidden behind display text. The following ctrl-click hyperlink is an example of what I encounter in the .xls files: ES2866911 (T3).
To improve reproducibility, I have added samples of xls1 and xls2 below.
xls1:

Title    Publication_Number
P_A      ES2866911 (T3)
P_B      EP3887362 (A1)

xls2:

Title    Publication_Number
P_C      AR118706 (A2)
P_D      ES2867600 (T3)

Desired outcome:

Title    Publication_Number
P_A      ES2866911 (T3)
P_B      EP3887362 (A1)
P_C      AR118706 (A2)
P_D      ES2867600 (T3)
I am unable to get the .xls files into Python without losing the formatting or the hyperlinks. In addition, I am unable to convert the .xls files to .xlsx, and I have no way to acquire the files in .xlsx format in the first place. Below I briefly summarize what I have tried:
1.) Reading with pandas was my first attempt. Easy to do, but all hyperlinks are lost in pandas; furthermore, all formatting from the original file is lost.
2.) Reading .xls files with openpyxl.load_workbook:
InvalidFileException: openpyxl does not support the old .xls file format, please use xlrd to read this file, or convert it to the more recent .xlsx file format.
3.) Converting .xls files to .xlsx:

from xls2xlsx import XLS2XLSX
x2x = XLS2XLSX('input_file.xls')
wb = x2x.to_xlsx()
x2x.to_xlsx('output_file.xlsx')

TypeError: got invalid input value of type <class 'xml.etree.ElementTree.Element'>, expected string or Element

import pyexcel as p
p.save_book_as(file_name='input_file.xls', dest_file_name='export_file.xlsx')

TypeError: got invalid input value of type <class 'xml.etree.ElementTree.Element'>, expected string or Element
During handling of the above exception, another exception occurred:
StopIteration
4.) Even when I am able to read the .xls file with xlrd, for example (meaning I will never be able to save the file as .xlsx), I can't even see the hyperlink:
import xlrd
wb = xlrd.open_workbook(file)  # where file points to your test .xls file
ws = wb.sheet_by_name('Sheet1')
ws.cell(5, 1).value
'AR118706 (A2)'  # which is the display text, not the hyperlink
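As it turns out, xlrd only populates hyperlink information when the workbook is opened with formatting_info=True; with the defaults, sheet.hyperlink_map is empty. A minimal sketch, reusing file from the snippet above:

import xlrd

wb = xlrd.open_workbook(file, formatting_info=True)  # required for hyperlink_map
ws = wb.sheet_by_name('Sheet1')
# hyperlink_map maps (row, col) to Hyperlink objects carrying url_or_path
for (row, col), link in ws.hyperlink_map.items():
    print(row, col, link.url_or_path)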
5.) I tried installing an older version, openpyxl==3.0.1, to overcome the TypeError, with no success. I tried to open the .xls file with openpyxl using the xlrd engine, and a similar 'xml.etree.ElementTree.Element' TypeError occurred. I tried many ways to batch-convert .xls files to .xlsx, all with similar errors.
Obviously I can just open the files with Excel and save them as .xlsx, but this defeats the entire purpose, and I can't do that for hundreds of files.
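That said, the manual Save As step itself can be automated with win32com when Excel is installed; a hedged sketch for batch conversion, with a hypothetical folder (FileFormat=51 is xlOpenXMLWorkbook, i.e. .xlsx):

import os
import win32com.client as win32

src_dir = r'C:\xls_files'  # hypothetical input folder
xl = win32.Dispatch('Excel.Application')
xl.Visible = False
for fn in os.listdir(src_dir):
    if fn.lower().endswith('.xls'):
        wb = xl.Workbooks.Open(os.path.join(src_dir, fn))
        wb.SaveAs(os.path.join(src_dir, fn + 'x'), FileFormat=51)  # foo.xls -> foo.xlsx
        wb.Close(SaveChanges=False)
xl.Quit()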
You need the xlrd library to read the hyperlinks properly, pandas to merge all the data together, and xlsxwriter to write the data back out.
Assuming all input files have the same format, you can use the code below.
# imports
import os
import xlrd
import xlsxwriter
import pandas as pd

# required functions
def load_excel_to_df(filepath, hyperlink_col):
    # formatting_info=True is required, otherwise xlrd leaves hyperlink_map empty
    book = xlrd.open_workbook(filepath, formatting_info=True)
    sheet = book.sheet_by_index(0)
    hyperlink_map = sheet.hyperlink_map
    data = pd.read_excel(filepath)
    hyperlink_col_index = list(data.columns).index(hyperlink_col)
    required_links = [v.url_or_path for k, v in hyperlink_map.items() if k[1] == hyperlink_col_index]
    data['hyperlinks'] = required_links
    return data

# main code
# set required variables
input_data_dir = 'path/to/input/data/'
hyperlink_col = 'Publication_Number'
output_data_dir = 'path/to/output/data/'
output_filename = 'combined_data.xlsx'

# read and combine data
required_files = os.listdir(input_data_dir)
combined_data = pd.DataFrame()
for file in required_files:
    curr_data = load_excel_to_df(input_data_dir + os.sep + file, hyperlink_col)
    combined_data = pd.concat([combined_data, curr_data], sort=False, ignore_index=True)
cols = list(combined_data.columns)
m, n = combined_data.shape
hyperlink_col_index = cols.index(hyperlink_col)

# writing data
writer = pd.ExcelWriter(output_data_dir + os.sep + output_filename, engine='xlsxwriter')
combined_data[cols[:-1]].to_excel(writer, index=False, startrow=1, header=False)  # last column contains hyperlinks
workbook = writer.book
worksheet = writer.sheets[list(workbook.sheetnames.keys())[0]]
for i, col in enumerate(cols[:-1]):
    worksheet.write(0, i, col)
for i in range(m):
    worksheet.write_url(i + 1, hyperlink_col_index, combined_data.loc[i, cols[-1]], string=combined_data.loc[i, hyperlink_col])
writer.close()
References:
- reading hyperlinks: https://stackoverflow.com/a/7057076/17256762
- pandas to_excel header formatting: Remove default formatting in header when converting pandas DataFrame to excel sheet
- writing hyperlinks with xlsxwriter: https://xlsxwriter.readthedocs.io/example_hyperlink.html
Without a clear reproducible example, the problem is not clear. Assume I have two files called tmp.xls and tmp2.xls containing dummy data as in the two screenshots below.
Then pandas can easily load, concatenate, and convert to .xlsx format without loss of hyperlinks. Here is some demo code and the resulting file:
import pandas as pd
f1 = pd.read_excel('tmp.xls')
f2 = pd.read_excel('tmp2.xls')
f3 = pd.concat([f1, f2], ignore_index=True)
f3.to_excel('./f3.xlsx')
Inspired by @Kunal, I managed to write code that avoids using the pandas library. The .xls files are read by xlrd and written to a new Excel file by xlwt. Hyperlinks are maintained, and the output file is saved with an .xlsx extension (note that xlwt actually writes the legacy .xls format, whatever the extension):
import os
import xlwt
from xlrd import open_workbook

# read and combine data
directory = "random_directory"
required_files = os.listdir(directory)

# Define new file and sheet to copy the files into
new_file = xlwt.Workbook(encoding='utf-8', style_compression=0)
new_sheet = new_file.add_sheet('Sheet1', cell_overwrite_ok=True)

# Initialize header row, can be done with any file
old_file = open_workbook(directory + "/" + required_files[0], formatting_info=True)
old_sheet = old_file.sheet_by_index(0)
for column in range(old_sheet.ncols):
    new_sheet.write(0, column, old_sheet.cell(0, column).value)  # To create header row

# Add rows from all files present in folder
for file in required_files:
    old_file = open_workbook(directory + "/" + file, formatting_info=True)
    old_sheet = old_file.sheet_by_index(0)  # Define old sheet
    hyperlink_map = old_sheet.hyperlink_map  # Create map of all hyperlinks
    for row in range(1, old_sheet.nrows):  # We need all rows except the header row
        if row - 1 < len(hyperlink_map.items()):  # Ensure we do not go out of range on the lower side of hyperlink_map.items()
            Row_depth = len(new_sheet._Worksheet__rows)  # We need the row depth to know where to add the new row
            for col in range(old_sheet.ncols):  # For every column we need to add a row cell
                if col == 1:  # We need to make an exception for column 2 being the hyperlinked column
                    click = list(hyperlink_map.items())[row - 1][1].url_or_path  # define URL
                    new_sheet.write(Row_depth, col, xlwt.Formula('HYPERLINK("{}", "{}")'.format(click, old_sheet.cell(row, 1).value)))
                else:  # If not the hyperlinked column
                    new_sheet.write(Row_depth, col, old_sheet.cell(row, col).value)  # Write cell

new_file.save("random_directory/output_file.xlsx")
I assume the same as daedalus in terms of the Excel files. Instead of pandas, I use openpyxl to read the files and create a new Excel file.
import openpyxl

wb1 = openpyxl.load_workbook('tmp.xlsx')
ws1 = wb1['Sheet1']
wb2 = openpyxl.load_workbook('tmp2.xlsx')
ws2 = wb2['Sheet1']

csvDict = {}

# Go through the first sheet to collect the hyperlinks and keys (skip the header row).
for row in range(2, ws1.max_row + 1):
    csvDict[ws1.cell(row=row, column=1).value] = [
        ws1.cell(row=row, column=2).hyperlink.target,
        ws1.cell(row=row, column=2).value,
    ]

# Go through the second sheet to collect the hyperlinks and keys.
for row in range(2, ws2.max_row + 1):
    csvDict[ws2.cell(row=row, column=1).value] = [
        ws2.cell(row=row, column=2).hyperlink.target,
        ws2.cell(row=row, column=2).value,
    ]
Now you have all the data, so you can create a new workbook and save the values from the dict into it via openpyxl.
from openpyxl import Workbook

wb = Workbook(write_only=True)
ws = wb.create_sheet()
for key, (link, text) in csvDict.items():
    # use ws.append() to add the data from the dict; write-only sheets only support appending rows
    ws.append([key, text, link])
wb.save('new_big_file.xlsx')
https://openpyxl.readthedocs.io/en/stable/optimized.html#write-only-mode
Here is my process:
Step 1: Open File1
Step 2: Load Sheet1
Step 3: Load OutputFile
Step 4: Create a new sheet in OutputFile
Step 5: Copy contents cell by cell from Step 2 to paste in the sheet created in Step 4
Step 6: Repeat the process 'n' number of times
I have created a Python script to achieve this, but the program is insanely slow; it takes an hour to complete. Here is a snippet of the code that does the copying over.
import xlsxwriter as xlsx
import openpyxl as xl

for i in range(6, k):
    # get the file location/name from the source file
    filename = sheet.cell_value(i, 3)
    # get the sheet name from the sheet read in the above statement
    sheetname = sheet.cell_value(i, 4)
    # print the file name to verify
    print(filename)
    # get the output sheet name
    outputsheetname = sheet.cell_value(i, 5)
    # load the source workbook
    wb1 = xl.load_workbook(filename=filename, data_only=True)
    # get the index of the sheet to be copied
    wb1_sheet_index = wb1.sheetnames.index(sheetname)
    # load the sheet
    ws1 = wb1.worksheets[wb1_sheet_index]
    # load the output workbook
    wb2 = xl.load_workbook(filename=output_loc)
    # create a new sheet in the output workbook
    ws2 = wb2.create_sheet(outputsheetname)
    # print(ws2, ":", outputsheetname)
    for row in ws1:
        for cell in row:
            ws2[cell.coordinate].value = cell.value
    wb2.save(output_loc)
The filename, sheetname and outputsheetname come from a master Excel sheet where I keep the file locations and sheet names. I load this file before this loop.
Also, I want only the contents of the cells to be copied: if the source sheet has any formulas, I do not want those copied over. And if there is a value 500 in cell A5, I want that value to be in cell A5 of the output sheet.
Maybe I am approaching this the wrong way. Any help is appreciated.
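For what it's worth, most of the time here is likely spent reloading and re-saving the output workbook on every iteration. A sketch of the same loop with the output workbook loaded once and saved once, assuming sheet, k and output_loc are defined as in the original script:

import openpyxl as xl

wb2 = xl.load_workbook(filename=output_loc)  # load the output workbook once
for i in range(6, k):
    filename = sheet.cell_value(i, 3)
    sheetname = sheet.cell_value(i, 4)
    outputsheetname = sheet.cell_value(i, 5)
    wb1 = xl.load_workbook(filename=filename, data_only=True)  # values, not formulas
    ws1 = wb1[sheetname]
    ws2 = wb2.create_sheet(outputsheetname)
    for row in ws1.iter_rows():
        for cell in row:
            ws2[cell.coordinate].value = cell.value
wb2.save(output_loc)  # save once at the end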
openpyxl is one of the slowest modules for working with Excel files. You can try doing it with xlwings, or, if you're okay with using an Excel add-in, there is RDB Merge, which is comparatively fast and does the job.
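A minimal xlwings sketch of such a copy, assuming a recent xlwings (Sheet.copy was added in 0.22) and a local Excel installation; the filenames and sheet name are hypothetical:

import xlwings as xw

app = xw.App(visible=False)
try:
    src = app.books.open('source.xlsx')  # hypothetical source workbook
    out = app.books.open('output.xlsx')  # hypothetical output workbook
    # let Excel copy the whole sheet itself, much faster than cell-by-cell
    src.sheets['Sheet1'].copy(after=out.sheets[len(out.sheets) - 1])
    out.save()
finally:
    app.quit()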
I have over 50 workbooks that I want to combine into one workbook as 50 sheets, with formatting, coloring, filling, etc. still intact.
This is what I tried:
import pandas as pd
import os
from openpyxl import load_workbook
from openpyxl import Workbook

path = "mypath"
directory = os.listdir(f'{path}')
files = [f for f in directory if f[-4:] == 'xlsx']

combined = Workbook()
ws = combined.active

for item in files:
    wb = load_workbook(filename=f'{path}/{item}')
    sheet = wb.sheetnames
    data = pd.read_excel(f'{path}/{item}', sheet_name=f'{sheet[0]}')
    data.to_excel(f'{path}/combined.xlsx', sheet_name=f'{sheet[0]}', header=None, index=None)
There were a couple of issues with the result:
1. It overwrote the sheet on each iteration, so the final workbook had one sheet, with the information of only the last workbook.
2. The sheets did not retain their formatting.
I'm essentially trying to copy each sheet into one workbook, as I would with Excel's copy-sheet command, without having to do it 50 times.
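Since pandas drops the formatting on this path, one hedged alternative is to drive Excel's own copy-sheet command through win32com, which keeps formatting, coloring and filling intact; a minimal sketch with a hypothetical path:

import os
import win32com.client as win32

path = r'C:\mypath'  # hypothetical folder holding the 50 workbooks
files = [f for f in os.listdir(path) if f.endswith('.xlsx')]

xl = win32.Dispatch('Excel.Application')
xl.Visible = False
combined = xl.Workbooks.Add()
for item in files:
    wb = xl.Workbooks.Open(os.path.join(path, item))
    # Excel's own copy preserves all formatting
    wb.Worksheets(1).Copy(After=combined.Worksheets(combined.Worksheets.Count))
    wb.Close(SaveChanges=False)
combined.SaveAs(os.path.join(path, 'combined.xlsx'), FileFormat=51)  # 51 = .xlsx
xl.Quit()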
I have multiple sheets in one Excel workbook with duplicated columns in each sheet. I need to delete the duplicates and leave only the original columns.
I know how to drop duplicates within a sheet.
df_sheet_map['> Acute Hospital Bed SLM']
result2=df_sheet_map['> Acute Hospital Bed SLM'].T.drop_duplicates().T
import os
import pandas as pd

dfList = []
path = 'J:/TestDup'
newpath = 'J:/TestDup/Test2'

for fn in os.listdir(path):
    file = os.path.join(path, fn)
    if os.path.isfile(file):
        # Import the excel file and call it xlsx_file
        xlsx_file = pd.ExcelFile(file)
        # View the excel file's sheet names
        xlsx_file.sheet_names
        # Load the xlsx file's 'Sheet1' as a dataframe
        df = xlsx_file.parse('Sheet1', header=None)
        # Drop the first two rows
        df_NoHeader = df[2:]
        data = df_NoHeader
        # Save the individual dataframe
        data.to_excel(os.path.join(newpath, fn))
        dfList.append(data)

appended_data = pd.concat(dfList)
appended_data.to_excel(os.path.join(newpath, 'master_data.xlsx'))
The above code works. However, I need to traverse all sheets, and it currently just drops the first two rows; I need to change it to delete duplicate columns instead.
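A minimal sketch of that change, assuming the transpose-and-drop_duplicates trick from the top of the question should run on every sheet of every workbook (paths reuse the variables from the snippet above):

import os
import pandas as pd

path = 'J:/TestDup'
newpath = 'J:/TestDup/Test2'

for fn in os.listdir(path):
    file = os.path.join(path, fn)
    if not os.path.isfile(file):
        continue
    xlsx_file = pd.ExcelFile(file)
    with pd.ExcelWriter(os.path.join(newpath, fn)) as writer:
        for sheet_name in xlsx_file.sheet_names:  # traverse every sheet
            df = xlsx_file.parse(sheet_name, header=None)
            # transpose, drop duplicate rows (i.e. duplicate columns), transpose back
            df = df.T.drop_duplicates().T
            df.to_excel(writer, sheet_name=sheet_name, header=False, index=False)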
# Transpose all sheets in a workbook, then delete duplicates, then transpose
# back to the original shape and save all sheets.

# Transpose all sheets in the workbook file
import pyexcel

book = pyexcel.get_book(file_name="H:/SLM_Final/SLM Indicator template Main to clean.xlsx")
for sheet in book:
    sheet.transpose()
book.save_as("H:/SLM_Final/SLM Indicator template Main to clean.xlsx")
# Run the Excel VBA macro from Python
import win32com.client as win32
import time
xl = win32.Dispatch('Excel.Application')
xl.Visible = 0
ss = xl.Workbooks.Open('H:/SLM_Final/DeleteDup.xlsm')
xl.Run("deleteDuplicate")
time.sleep(30)
xl.Quit()
time.sleep(30)
# VBA macro to add to the Excel workbook:
'''Sub deleteDuplicate()
    Dim ws As Worksheet
    Dim wkbk1 As Workbook
    Dim w As Long
    Dim lRow As Long
    Dim iCntr As Long
    Set wkbk1 = Workbooks.Open("H:/SLM_Final/SLM Indicator template Main to clean.xlsx")
    'Set wkbk1 = ThisWorkbook
    wkbk1.Activate
    With wkbk1
        For w = 1 To .Worksheets.Count
            With Worksheets(w)
                .UsedRange.RemoveDuplicates Columns:=Array(3, 4), Header:=xlYes
            End With
        Next w
    End With
    wkbk1.Save
    wkbk1.Close
End Sub'''
# Transpose the files back to their original shape
import pyexcel

book = pyexcel.get_book(file_name="H:/SLM_Final/SLM Indicator template Main to clean.xlsx")
for sheet in book:
    sheet.transpose()
    # sheet.delete_duplicates(keep=False, inplace=False)
book.save_as("H:/SLM_Final/SLM Indicator template Main to clean.xlsx")
I hope this will help.