splitting an Excel workbook into multiple excel files - python

I have an Excel workbook with 29 different sheets. I used the following code to save each sheet as an individual Excel file:
from xlrd import open_workbook
from xlwt import Workbook
# Split the workbook into one .xls file per sheet, copying cell values only.
rb = open_workbook('c:\\original file.xls',formatting_info=True)
for a in range(rb.nsheets):  # every sheet in the workbook, not a fixed count
    rs = rb.sheet_by_index(a)
    new_book = Workbook()
    new_sheet = new_book.add_sheet('Sheet 1')
    # Only the cell values are written to the new sheet
    for row in range(rs.nrows):
        for col in range(rs.ncols):
            new_sheet.write(row, col, rs.cell(row, col).value)
    # Output files are named by sheet index: c:\0.xls, c:\1.xls, ...
    new_book.save("c:\\" + str(a) + ".xls")
I got this code from: stackoverflow.com/questions/28873252/python-splitting-an-excel-workbook. It worked well, but is there a way I could save the workbooks by sheet name, so that each file is named after the sheet it contains? I tried replacing
new_book.save("c:\\" + str(a) + ".xls")
with
new_book.save(sheet.names + str(a) + ".xls")
But it didn't work.

If I understand your requirement correctly, you can use pandas: open the workbook with pd.ExcelFile, loop over its sheet names, and write each sheet to a file named after that sheet.
import pandas as pd
# Write every sheet of the workbook to its own file, named after the sheet.
# The context manager closes the workbook handle once all sheets are written.
with pd.ExcelFile('c:\\original file.xls') as xl:
    for sheet in xl.sheet_names:
        df = xl.parse(sheet)
        # NOTE(review): writing .xls relies on the xlwt engine, which pandas 2.0
        # removed -- on newer pandas change the extension to .xlsx. Confirm
        # against the installed pandas version.
        df.to_excel(f"{sheet}.xls", index=False)

Related

How to copy multiple .xlsx files into a respective .csv file?

I have 24 Excel files, and I'm aiming to copy the .xlsx data to their respective 24 .csv files. I have copied the data over; however, it's creating 10 copies in the .csv files, and I believe it has something to do with the for loops. I've tried to use writerow() rather than writerows(), yet that doesn't help. I'm trying to understand openpyxl and its writer and reader objects.
import openpyxl, os, csv
from pathlib import Path
for excelFile in os.listdir('./excelspreadsheets'):
if excelFile.endswith('.xlsx'): # Skip non xlsx files, load the workbook object
wb = openpyxl.load_workbook('./excelspreadsheets/' + excelFile)
for sheetName in wb.sheetnames:
# Loop through every sheet in the workbook
sheet = wb[sheetName]
sheetTitle = sheet.title
# Create the CSV filename from the Excel filename and sheet title
p = Path(excelFile)
excelFileStemName = p.stem
CsvFilename = excelFileStemName + '_' + sheetTitle + '.csv'
# Create the csv.writer object for this CSV file
print(f'Creating filename {CsvFilename}...')
outputFile = open(CsvFilename, 'w', newline='')
outputWriter = csv.writer(outputFile)
# Create reader object for each excel sheet
fileObj = open('./excelspreadsheets/' + excelFile)
fileReaderObj = csv.reader(fileObj)
# Loop through every row in the excel sheet
for rowNum in range(1, sheet.max_row + 1):
rowData = [] # append each cell to this list
# Loop through each cell in the row
for colNum in range(1, sheet.max_column + 1):
rowData.append(sheet.values)
# write the rowData list to the CSV file.
for row in rowData:
outputWriter.writerows(row)
outputFile.close()
So, each of the newly created .csv files writes the correct data but does it 10 times, rather than once.
Appreciate any feedback thanks.
You can use read_excel and to_csv, which come as part of pandas to read excel file and write the data to csv file. It is just simpler from coding perspective, as the read and write will be done in one line. It also uses Openpyxl underneath. The updated code is below.
import openpyxl, os, csv
from pathlib import Path
import pandas as pd
# Convert every sheet of every .xlsx file in ./excelspreadsheets to its own CSV.
for excelFile in os.listdir('./excelspreadsheets'):
    if not excelFile.endswith('.xlsx'):  # Skip non xlsx files
        continue
    # Open the workbook once and reuse it for every sheet
    with pd.ExcelFile('./excelspreadsheets/' + excelFile) as xls:
        for sheetname in xls.sheet_names:
            # Read each sheet into df
            df = xls.parse(sheetname)
            # Path.stem removes only the extension. str.rstrip('.xlsx') would
            # strip ANY trailing '.', 'x', 'l' or 's' characters, turning
            # 'sales.xlsx' into 'sale'.
            CsvFilename = Path(excelFile).stem + '_' + sheetname + '.csv'
            print(f'Creating filename {CsvFilename}...')
            # Write df as CSV to file
            df.to_csv(CsvFilename, index=False)
Let me know if you see any errors...

I want to get some data from different sheet of the same file python

I want to get some data from a single file but from different sheets. I tried the code below, but it gives only the data of the first sheet:
from openpyxl import load_workbook
work = load_workbook(filename=r'the name of the file.xlsx',data_only=True)
for sheet in work.sheetnames[1:len(work.sheetnames)]:
n = 3
sheet1 = work[work.sheetnames[n]]
for val in sheet1.iter_rows(min_row=9, max_row=14, min_col=6, max_col=8, values_only=True):
print(str(sheet) + " " + str(val))
n += 1
Assume we have an .xlsx file with 6 sheets named sheet0, sheet1, sheet2, ..., sheet5.
work.sheetnames gives you a list of all sheet names: ['sheet0', 'sheet1', 'sheet2', 'sheet3', ..., 'sheet5']
When you use:
for sheet in work.sheetnames[1:len(work.sheetnames)]:
this means the loop runs over sheet1 through sheet5 (the first sheet, sheet0, is skipped).
After that, you have:
n = 3
sheet1 = work[work.sheetnames[n]]
work.sheetnames[3] returns the string 'sheet3', so variable sheet1 is always work['sheet3'] in every iteration. I think this is why your result does not act the way you wish.
The code below probably can fix the problem.
from openpyxl import load_workbook
work = load_workbook(filename=r'the name of the file.xlsx',data_only=True)
# Skip the first sheet, then read the same cell range from each remaining sheet
for sheet in work.sheetnames[1:]:
    # Look the worksheet up by the loop's own sheet name so it changes each iteration
    sheet1 = work[sheet]
    for val in sheet1.iter_rows(min_row=9, max_row=14, min_col=6, max_col=8, values_only=True):
        print(str(sheet) + " " + str(val))

Combine multiple excel workbooks into one with multiple sheets

I have over 50 workbooks that I want to combine into one workbook as 50 sheets, with formatting, coloring, filling, etc. still intact.
This is what I tried:
import pandas as pd
import os
from openpyxl import load_workbook
from openpyxl import Workbook
path = "mypath"
directory = os.listdir(f'{path}')
files = [f for f in directory if f[-4:] == 'xlsx']
combined = Workbook()
ws = combined.active
for item in files:
wb = load_workbook(filename = f'{path}/{item}')
sheet = wb.sheetnames
data = pd.read_excel(f'{path}/{item}',sheet_name=f'{sheet[0]}')
data.to_excel(f'{path}/combined.xlsx',sheet_name=f'{sheet[0]}',header=None,index=None)
There were a couple issues with the result:
1. It overwrote the sheet each iteration, so the final workbook had 1 sheet, with information of only the last workbook.
2. The sheet did not retain the formatting
I'm essentially trying to copy each sheet into one workbook as I would using Excel's copy sheet command without having to do it 50 times.

Converting multiple xls files to xlsx- issues with scaling up from single file

We have a few thousand xls files, with dozens of sheets in each file. We are working on a larger project to combine the files and sheets, but first need to convert them to xlsx.
The following code works fine on a single file:
import xlrd
from openpyxl.workbook import Workbook as openpyxlWorkbook
xlsBook = xlrd.open_workbook(C://path)
workbook = openpyxlWorkbook()
for i in xrange(0, xlsBook.nsheets):
xlsSheet = xlsBook.sheet_by_index(i)
sheet = workbook.active if i == 0 else workbook.create_sheet()
sheet.title = xlsSheet.name
for row in xrange(0, xlsSheet.nrows):
for col in xrange(0, xlsSheet.ncols):
sheet.cell(row=row+1, column=col+1).value = xlsSheet.cell_value(row, col)
workbook.save(c://path/workbook.xlsx")
This works perfectly.
When attempting to loop through all files, we use:
import xlrd
from openpyxl.workbook import Workbook as openpyxlWorkbook
import glob
import pandas as pd
from pandas import ExcelWriter
import os
path ="C://path"
path2 = "C://path2"
allFiles = glob.glob(path + "/*.xls")
for file_ in allFiles:
xlsBook = xlrd.open_workbook(file_)
workbook = openpyxlWorkbook()
for i in xrange(0, xlsBook.nsheets):
xlsSheet = xlsBook.sheet_by_index(i)
sheet = workbook.active if i == 0 else workbook.create_sheet()
sheet.title = xlsSheet.name
for row in xrange(0, xlsSheet.nrows):
for col in xrange(0, xlsSheet.ncols):
sheet.cell(row=row+1, column=col+1).value = xlsSheet.cell_value(row, col)
##workbook.save(os.path.join(path2,file_))
##workbook.to_excel(os.path.join(path2,file_))
workbook.save("C://path/workbook.xlsx")
For the first two commented out save methods, workbook.save seems to do absolutely nothing, and to_excel tells me workbook does not have a property called to_excel...is that because I didn't call pandas in the loop?
The final workbook.save was a test- I assumed it would save the final iteration of the loop correctly, since it worked in the script with just one file.
Instead, it creates the file, with all of the worksheets correctly named, but no data in any of the worksheets.
Any idea what I am missing? To be clear, I am looking to have each file named with its original filename at the end of the loop, and a valid xlsx extension.
I'd try this way instead. Simpler code and it worked when I tested it.
import pandas as pd
import glob
def converter(filename, output_dir=r'C:\Users\you\Desktop'):
    """Convert one .xls workbook to .xlsx, keeping every sheet.

    Args:
        filename: Path of the workbook to read.
        output_dir: Folder that receives the .xlsx file; defaults to the
            hard-coded Desktop folder.
    """
    import os  # local import: this snippet only imports pandas and glob

    # Same base name as the input, with the extension swapped for .xlsx
    stem = os.path.splitext(os.path.basename(filename))[0]
    target = os.path.join(output_dir, stem + '.xlsx')

    # The context managers close the reader and save/close the writer on exit,
    # so no explicit writer.save() call is needed.
    with pd.ExcelFile(filename) as xl, pd.ExcelWriter(target) as writer:
        for sheet_name in xl.sheet_names:  # sheets are written in workbook order
            data = xl.parse(sheet_name)
            data.to_excel(writer, sheet_name=sheet_name, index=False)


# Raw string so the backslash before '*' is not read as an escape sequence
files = glob.glob(r'C:\Users\you\Desktop\*.xls')
for file in files:
    converter(file)
Edit: I'm not too familiar with openpyxl, but I don't believe it has a .to_excel method. I think you were creating an openpyxl workbook but then trying to save it using a pandas method.

Python splitting an Excel workbook

I am looking for a way to split an Excel workbook that contains multiple tabs/sheets into multiple workbooks, one for each tab/sheet in the original workbook:
Worked out:
from xlrd import open_workbook
from xlwt import Workbook
# Read the sheets one by one and save each as its own single-sheet workbook.
rb = open_workbook('c:\\original file.xls',formatting_info=True)
for a in range(rb.nsheets):  # one output workbook per sheet in the original
    rs = rb.sheet_by_index(a)
    new_book = Workbook()
    new_sheet = new_book.add_sheet('Sheet 1')
    # Only the cell values are written to the new sheet
    for row in range(rs.nrows):
        for col in range(rs.ncols):
            new_sheet.write(row, col, rs.cell(row, col).value)
    # Output files are named by sheet index: c:\0.xls, c:\1.xls, ...
    new_book.save("c:\\" + str(a) + ".xls")
This is actually nothing but reading the sheets one by one, and save them one by one. Is there a better, or more direct way?

Categories

Resources