How to split merged cells in an Excel workbook using Python

Is there any way to split/unmerge cells in an Excel workbook using Python? What I want is explained below.
The result should be a new Excel file with the following entries.
My solution, which uses xlrd to copy the same string to all merged columns, is given below.
[Note: the "formatting_info=True" flag is not yet implemented in the version of xlrd I am using, so I cannot directly get the list of merged cells. I am not supposed to upgrade xlrd on this setup.]
def xlsx_to_dict(xlsfile):
    workbook = xlrd.open_workbook(xlsfile)
    worksheet_names = workbook.sheet_names()
    for worksheet_name in worksheet_names:
        worksheet = workbook.sheet_by_name(worksheet_name)
        num_rows = worksheet.nrows - 1
        num_cells = worksheet.ncols - 1
        curr_row = -1
        header_row = worksheet.row(0)
        columns = []
        for cell in range(len(header_row)):
            value = worksheet.cell_value(0, cell)
            columns.append(value)
        cities = []
        for row in range(1, num_rows):
            value = worksheet.cell_value(row, 0)
            if not value == "":
                cities.append(value)
        names = []
        for row in range(1, num_rows):
            value = worksheet.cell_value(row, 1)
            if not value == "":
                names.append(value)
        current_city = cities[0]
        result_dict = {}
        for curr_row in range(1, num_rows):
            row = worksheet.row(curr_row)
            curr_cell = -1
            curr_name = names[0]
            while curr_cell < num_cells:
                curr_cell += 1
                cell_value = worksheet.cell_value(curr_row, curr_cell)
                if cell_value in cities and curr_cell == 0:
                    current_city = cell_value
                    if current_city not in result_dict:
                        result_dict[current_city] = {}
                    continue
                if cell_value == "" and curr_cell == 0:
                    continue
                if cell_value in names and curr_cell == 1:
                    curr_name = cell_value
                    if curr_name not in result_dict[current_city]:
                        result_dict[current_city][curr_name] = {}
                    continue
                if cell_value == "" and curr_cell == 1:
                    continue
                try:
                    result_dict[current_city][curr_name]['Phone'].append(cell_value)
                except KeyError:
                    result_dict[current_city][curr_name]['Phone'] = [cell_value]
    return result_dict
The above function returns a Python dictionary like this:
{ 'New York' : { 'Tom' : [92929292, 33929] }, ........}
I will then traverse the dictionary and write a new Excel file.
However, I am looking for a generic way of splitting merged cells.
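For reference, a generic approach with openpyxl (a different library than xlrd, so this is only a sketch and assumes an .xlsx input and that adding a dependency is acceptable) is to unmerge every range and copy its top-left value into the freed cells:
from openpyxl import load_workbook

def unmerge_and_fill(path, out_path):
    wb = load_workbook(path)
    for ws in wb.worksheets:
        # Copy the list first: unmerge_cells() mutates ws.merged_cells.ranges
        for rng in list(ws.merged_cells.ranges):
            top_left = ws.cell(rng.min_row, rng.min_col).value
            ws.unmerge_cells(str(rng))
            # Write the top-left value into every cell of the former range
            for row in ws.iter_rows(min_row=rng.min_row, max_row=rng.max_row,
                                    min_col=rng.min_col, max_col=rng.max_col):
                for cell in row:
                    cell.value = top_left
    wb.save(out_path)

unmerge_and_fill('input.xlsx', 'unmerged.xlsx')   # placeholder file names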

This function gets the "real" cell value, i.e., the value of the merged cell if the coordinates are anywhere inside the merged cell.
def unmergedValue(rowx, colx, thesheet):
    for crange in thesheet.merged_cells:
        rlo, rhi, clo, chi = crange
        if rowx in xrange(rlo, rhi):
            if colx in xrange(clo, chi):
                return thesheet.cell_value(rlo, clo)
    # if you reached this point, it's not in any merged cells
    return thesheet.cell_value(rowx, colx)
Loosely based on http://www.lexicon.net/sjmachin/xlrd.html#xlrd.Sheet.merged_cells-attribute
Very inefficient, but should be acceptable for small-ish spreadsheets.
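A usage sketch for the helper above; note that merged_cells is only populated when the workbook is opened with formatting_info=True (available for .xls files), which may conflict with the xlrd constraint in the question:
import xlrd

# 'input.xls' is a placeholder; formatting_info=True is required so that
# thesheet.merged_cells is actually populated.
book = xlrd.open_workbook('input.xls', formatting_info=True)
sheet = book.sheet_by_index(0)

unmerged = [[unmergedValue(r, c, sheet) for c in range(sheet.ncols)]
            for r in range(sheet.nrows)]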

If your file has no legitimately empty cells in the middle, this may help: read the file, do the processing, and rewrite it.
def read_merged_xls(file_contents):
    book = xlrd.open_workbook(file_contents=file_contents)
    data = []
    sheet = book.sheet_by_index(0)
    for rx in range(sheet.nrows):
        line = []
        for ry in range(sheet.ncols):
            cell = sheet.cell_value(rx, ry)
            if not cell:
                cell = data[-1][ry] if data else ''
            line.append(cell)
        data.append(line)
    return data
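For reference, the function takes the raw bytes of the workbook, so a call could look like this (the file name is an assumption):
import xlrd  # needed by read_merged_xls above

with open('input.xls', 'rb') as f:      # 'input.xls' is a placeholder
    data = read_merged_xls(f.read())

for row in data:
    print(row)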

import xlrd
import xlsxwriter
import numpy as np
import pandas as pd

def rep(l, i):
    j = i
    while j >= 0:
        if not l[j - 1] == u'':
            return l[j - 1]
        else:
            j = j - 1

def write_df2xlsx(df, filename):
    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer = pd.ExcelWriter(filename, engine='xlsxwriter')
    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    # Close the Pandas Excel writer and output the Excel file.
    writer.save()

def csv_from_excel(filename):
    wb = xlrd.open_workbook(filename)
    worksheet_names = wb.sheet_names()
    for worksheet_name in worksheet_names:
        sh = wb.sheet_by_name(worksheet_name)
        # To find the headers/column names of the xlsx file
        header_index = 0
        for i in range(sh.nrows):
            if len(list(filter(lambda x: not (x.value == xlrd.empty_cell.value), sh.row(i)))) == len(sh.row(i)):
                header_row = sh.row(i)
                header_index = i
                break
        columns = []
        for cell in range(len(header_row)):
            value = sh.cell_value(header_index, cell)
            columns.append(value)
        rows = []
        for rownum in range(header_index + 1, sh.nrows):
            rows.append(sh.row_values(rownum))
        data = pd.DataFrame(rows, columns=columns)
        cols = [col for col in data.columns if u'' in list(data[col])]
        res = []
        for col in cols:
            t_list = list(data[col])
            res.append(list(map(lambda x, y: rep(list(data[col]), y[0]) if x == u'' else x, t_list, enumerate(t_list))))
        for (col, r) in zip(cols, res):
            data[col] = pd.core.series.Series(r)
        write_df2xlsx(data, 'ResultFile.xlsx')
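With these helpers in place, a single call such as the one below (the input file name is an assumption) fills the blanks left by merged cells and writes ResultFile.xlsx:
# Hypothetical driver call; 'input.xlsx' is a placeholder file name.
csv_from_excel('input.xlsx')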

Related

Python Code Excel Cell Color Check is not working

import openpyxl

def sum_gr_values(file_name):
    # Load the workbook
    workbook = openpyxl.load_workbook(file_name)
    sheet = workbook.active
    # Find the first yellow-filled cell
    yellow_cell = None
    for row in sheet.iter_rows():
        for cell in row:
            if cell.fill.start_color.rgb == '#FFFF00':
                yellow_cell = cell
                break
        if yellow_cell:
            break
    # Set the start and end row variables
    start_row = yellow_cell
    end_row = start_row
    # Find the last yellow-filled cell
    for row in range(start_row + 1, sheet.max_row + 1):
        if sheet.cell(row=row, column=yellow_cell.column).fill.start_color.index == 'FFFF00':
            end_row = row
        else:
            break
    # Check if start_row and end_row have values
    if start_row and end_row:
        # Initialize a variable to store the sum
        total_sum = 0
        # Sum the values in column "E" if the value in column "F" is "GR"
        for row in range(start_row, end_row + 1):
            if sheet.cell(row=row, column=6).value == "GR":
                total_sum += sheet.cell(row=row, column=5).value
        # Write the sum to the first yellow-filled cell in column "E"
        yellow_cell.offset(0, -1).value = total_sum
    # Save the changes to the file
    workbook.save(file_name)

file_name = "file.xlsx"
sum_gr_values(file_name)
In the "# Find the first yellow-filled cell" section yellow cell is not dedected. I've also add the excel screnshot. Tried to debug but could not understand problem. I am new in coding so it might be an easy one.
It needs to detect the yellow rows.
You can use the openpyxl Python library to parse an Excel document, iterate over its cells, and check each cell's value, style, color, etc.
See also: https://stackoverflow.com/questions/69410376/how-to-detect-color-of-excel-xlsx-row
Check the last 6 characters
if cell.fill.start_color.rgb[-6:] == 'FFFF00':
import openpyxl
from openpyxl.styles import PatternFill

def sum_gr_values(file_name):
    # list of yellow cells
    cellY = []
    # Load the workbook
    workbook = openpyxl.load_workbook(file_name)
    sheet = workbook.active
    # Find the yellow-filled rows
    for r in sheet.iter_rows():
        if r[0].fill.start_color.rgb[-6:] == 'FFFF00':
            cellY.append(r[0])
    # append last row
    cellY.append(r[0].offset(1))
    # check results
    for n in range(0, len(cellY)):
        print("n: {} rowY: {}".format(n, cellY[n].row))
    print()
    ## Check have 2 rows
    if len(cellY) < 2:
        print("ERROR - No yellow rows found")
        exit()
    # summate each pair of yellow lines
    for n in range(0, len(cellY) - 1):
        # Initialize a variable to store the sum
        total_sum = 0
        r0 = cellY[n].row      # start row
        r1 = cellY[n + 1].row  # end row
        print("n={} r0={} r1={}".format(n, r0, r1))
        ## Sum the values in column "E" if the value in column "F" is "GR"
        for rowno in range(r0 + 1, r1):
            colE = sheet.cell(rowno, 5).value
            colF = sheet.cell(rowno, 6).value
            print(rowno, colE, colF)
            if colF == "GR":
                total_sum += colE
        ## Write the sum to the first yellow-filled cell in column "E"
        sheet.cell(r0, 5).value = total_sum
        sheet.cell(r0, 5).fill = PatternFill(start_color="FFFF00", fill_type="solid")
        print("total:{}\n".format(total_sum))
    # Save the changes to the file
    workbook.save(file_name)

# program
file_name = "file.xlsx"
sum_gr_values(file_name)

Checking if a Cell has value, if it has jumps to the other and writes value

I am trying to write a script that, when exporting values to a sheet, checks whether a cell already has a value and jumps to the next row if it does. I can't seem to make it work. The output is always 2 in the same cell.
if ws1.cell(column=1, row=xrow).value is None:
    sd = ws1.cell(column=1, row=xrow).value
    ws1.cell(column=1, row=xrow).value = 2
else:
    xrow = xrow + 1
    ws1.cell(column=1, row=xrow).value = 2
wb.save(dest_filename)
Welcome to Stack Overflow, @blaspas. What I have understood is that you are looking for an empty row to add data to, and otherwise you just continue.
The code below works for that:
import openpyxl

wb = openpyxl.load_workbook("file.xlsx")
ws = wb.get_sheet_by_name('Sheet1')
ws = wb.active
max_row_val = ws.max_row
col = 1
for rows in range(1, max_row_val + 1):
    if ws.cell(rows, col).value is None:
        sd = ws.cell(rows, col).value
        ws.cell(rows, col).value = 2
    elif ws.cell(rows, col).value is not None:
        ws.cell(rows, col).value = 2
        rows = rows + 1
wb.save("file.xlsx")

Load a spreadsheet, copy a row, and paste it in a different location

How can I copy a row, for example from D51 to F51, and paste these values into the row T20 to AF20?
I know how to load a spreadsheet
workbook = load_workbook(output)
sheet = workbook.active
But I don't know how to iterate in a loop to get this:
sheet["T2"] = "=D6"
sheet["U2"] = "=E6"
sheet["V2"] = "=F6"
sheet["W2"] = "=G6"
sheet["X2"] = "=H6"
sheet["Y2"] = "=I6"
sheet["Z2"] = "=J6"
sheet["AA2"] = "=K6"
sheet["AB2"] = "=L6"
sheet["AC2"] = "=M6"
sheet["AD2"] = "=N6"
sheet["AE2"] = "=O6"
sheet["AF2"] = "=P6"
You can achieve this by using the code below...
Note that the file output.xlsx is opened, updated and saved. The function num_to_excel_col is borrowed from here.
This will update columns 20 (T) onward for the next 15 columns (all in row 2) with the text "=D6", "=E6", etc. The num_to_excel_col function converts the column number to its Excel letter equivalent (e.g. 27 becomes AA).
import pandas as pd
import numpy as np
import openpyxl

workbook = openpyxl.load_workbook('output.xlsx')
ws = workbook.active

def num_to_excel_col(n):
    if n < 1:
        raise ValueError("Number must be positive")
    result = ""
    while True:
        if n > 26:
            n, r = divmod(n - 1, 26)
            result = chr(r + ord('A')) + result
        else:
            return chr(n + ord('A') - 1) + result

outcol = 4  # Paste in col 'D'
for col in range(20, 35):  # col 20 is T and doing this for next 15 columns
    txt = "=" + num_to_excel_col(outcol) + "6"
    print(txt)
    ws.cell(row=2, column=col).value = txt
    outcol += 1

workbook.save("output.xlsx")

How to generate new excel file for each iteration after looping the data from the input excel file

workbook = xlsxwriter.Workbook('ClaimSummaryReport.xlsx')
worksheet = workbook.add_worksheet()
format_1 = workbook.add_format({'num_format': ''})
format_2 = workbook.add_format({'num_format': ''})
# set the width
worksheet.set_column(0, 0, 90)
worksheet.set_column('B:B', 80, format_2)
# start from where
row = 0
col = 0
for x in output:
    worksheet.write(row, col, x, format_1)
    worksheet.write(row, col + 1, output[x], format_2)
    row += 1
workbook.close()
This code only generates one Excel file; on each loop iteration it overwrites the previous data.
You can just cram everything inside a for loop for a dirty fix:
# start from where
row = 0
col = 0
for idx, x in enumerate(output):
    workbook = xlsxwriter.Workbook(f'ClaimSummaryReport{idx}.xlsx')
    worksheet = workbook.add_worksheet()
    format_1 = workbook.add_format({'num_format': ''})
    format_2 = workbook.add_format({'num_format': ''})
    # set the width
    worksheet.set_column(0, 0, 90)
    worksheet.set_column('B:B', 80, format_2)
    worksheet.write(row, col, x, format_1)
    worksheet.write(row, col + 1, output[x], format_2)
    row += 1
    workbook.close()

How to re-write a while statement as an if statement in Python 2.7?

I wrote a script that searches an excel document for 'X', and when it finds an 'X' it copies the first column and first row associated with the 'X' into a CSV file.
I've been told that there's a better way to do this with 'if' statements. Not quite sure how.
Here's the code:
import xlrd
import csv

### Grab the data from sheet 1
def get_row_values(workSheet, row):
    to_return = []
    num_cells = myWorksheet.ncols - 1
    curr_cell = -1
    while curr_cell < num_cells:
        curr_cell += 1
        cell_value = myWorksheet.cell_value(row, curr_cell)
        to_return.append(cell_value)
    return to_return

file_path = 'foo.xlsx'
output = []

# Write the data
myWorkbook = xlrd.open_workbook(file_path)
myWorksheet = myWorkbook.sheet_by_name('foosheet')
num_rows = myWorksheet.nrows - 1
curr_row = 0
column_names = get_row_values(myWorksheet, curr_row)
#print("TOTAL ENTRIES:")
#print len(column_names)
#print("-----")
framework_name = myWorksheet.cell(0, 2)
framework_version = myWorksheet.cell(0, 3)
while curr_row < num_rows:
    curr_row += 1
    row = myWorksheet.row(curr_row)
    this_row = get_row_values(myWorksheet, curr_row)
    x = 0
    while x < len(this_row):
        if this_row[x] == 'x':
            output.append(['', fooname, foo_version,
                           foo_names[x], foo_row[0]])
            myData = [["foo1", "foo2",
                       "foo3", "foo4", "foo5"]]
            myFile = open('./results/barTemp.csv', 'w')
            with myFile:
                writer = csv.writer(myFile)
                writer.writerows(myData)
                writer.writerows(output)
        x += 1
#print output
myFile.close()
myWorkbook.release_resources()
It's not necessarily better; the runtime complexity is the same.
The difference would be a more compact loop.
For example, you can change
while x < len(this_row):
to
for x in this_row:
but I see that you use the 'x' index to look up column_names[x], so another approach might be better, such as
for x in range(len(this_row)):
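For illustration, enumerate gives both the index and the value in one pass; a minimal, self-contained example (the sample data is made up) would be:
# Sample data, made up for illustration only.
this_row = ['id', 'x', '', 'x']
column_names = ['A', 'B', 'C', 'D']
output = []

for x, cell_value in enumerate(this_row):   # index and value together
    if cell_value == 'x':
        output.append(column_names[x])

print(output)   # ['B', 'D']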
