Convert openpyxl to pandas - python

After doing some research I understood that pandas can give us better performace comparing to openpyxl.
I'm trying to convert this code from openpyxl to pandas:
def get_restaurant_data(self, res: Restaurant, sf_id: int) -> RestaurantSchema:
sheet = self.workbook["VendorInfo"]
for row in sheet.iter_rows(min_row=2):
if row[0].value == sf_id:
res.address = self.check_data(row[1].value, res.address)
if self.check_data(row[2].value) and self.check_data(row[3].value):
res.loc = Point(row[3].value, row[2].value)
res.phone = self.check_data(row[4].value, res.manager_phone)
logo = self.download_file(row[5].value)
cover = self.download_file(row[6].value)
if logo:
res.logo.save(self.check_data(row[5].value), logo, save=False)
if cover:
res.cover.save(self.check_data(row[6].value), cover, save=False)
return RestaurantSchema(**res.__dict__)
Anyone can give me a suggestion about how to convert this code?
df = pd.read_excel(self.path,index_col=None)
print(f'>>>{df}')```

Here is the equivalent code using pandas:
import pandas as pd
def get_restaurant_data(self, res: Restaurant, sf_id: int) ->
RestaurantSchema:
sheet = self.workbook["VendorInfo"]
df = pd.DataFrame(sheet.values)
for i, row in df[1:].iterrows():
if row[0] == sf_id:
res.address = self.check_data(row[1], res.address)
if self.check_data(row[2]) and self.check_data(row[3]):
res.loc = Point(row[3], row[2])
res.phone = self.check_data(row[4], res.manager_phone)
logo = self.download_file(row[5])
cover = self.download_file(row[6])
if logo:
res.logo.save(self.check_data(row[5]), logo, save=False)
if cover:
res.cover.save(self.check_data(row[6]), cover, save=False)
return RestaurantSchema(**res.__dict__)
This code uses pandas to read the data from the "VendorInfo" sheet into a dataframe, and then uses the iterrows() method to iterate over the rows of the dataframe. The rest of the code is mostly the same as the original, with the exception of using the [] operator to access the values in each row instead of the value attribute.
I hope this helps!

Related

Trouble with merging cells with openpyxl

I have a problem with merging cells in python:
def merge_cells_if_value(cell, cell_row, sheet, row_with_names_index, nested_row_index):
pre_up_cell_row = cell_row - 1
cell_column_letter = COLUMN_LETTERS[cell.column - 1]
pre_up_cell_coords = f'{cell_column_letter}{pre_up_cell_row}'
cur_cell_coords = f'{cell_column_letter}{cell.row}'
if sheet[pre_up_cell_coords].value is not None or pre_up_cell_row == nested_row_index:
if pre_up_cell_row != row_with_names_index:
if sheet[pre_up_cell_coords].value is None:
sheet[f'{pre_up_cell_coords}'] = ''
print(cell, pre_up_cell_coords, sheet[pre_up_cell_coords].value, nested_row_index)
sheet.merge_cells(
f'{pre_up_cell_coords}:'
f'{cur_cell_coords}'
)
target_cell = sheet[f'{pre_up_cell_coords}']
else:
target_cell = cell
make_cell_alignment(target_cell, wrap_text=True)
make_cell_border(target_cell)
else:
merge_cells_if_value(cell, pre_up_cell_row, sheet, row_with_names_index, nested_row_index)
`
I using library openpyxl and when open excel file sample.xlsx I get the next error:
"We found a problem with some content in 'filename.xlsx'. Do you want us to try to recover as much as we can?
if you trust the source of this workbook, click Yes".
Removed Records: Merge cells from /xl/worksheets/sheet0.xml part
I tried to fill all cells with, at least, empty string (''), and I know, that there isn't any cell with None value

Conditional_formatting to find duplicate values using openpyxl

I am trying to have a class highlight any duplicate values in column A using openpyxl.
Currently column A has the following values:
A
A
B
C
A
C
A
The end result would have all of the A's and C's cells colored in red. My code below doesn't throw any errors, but the file when opened after being ran through this doesn't have the any coloring in the duplicated cells.
import openpyxl
from openpyxl import formatting, styles
from openpyxl.formatting import Rule
class Duplicates():
def __init__(self, wb2):
self.wb2 = wb2
ws2=self.wb2.active
self.red_fill = styles.PatternFill(start_color ='ffc7ce', end_color = 'ffc7ce', fill_type = 'solid')
dxf= styles.differential.DifferentialStyle(fill=self.red_fill)
rule = Rule(type='duplicateValues',dxf=dxf,stopIfTrue = None)
ws2.conditional_formatting.add('$A:$A',rule)
self.wb2.save('testing.xlsx')
Duplicates(wb2)
Any help would be much appreciated.
Try this code, it work for me
red_text = Font(color="9C0006")
red_fill = PatternFill(bgColor="FFC7CE")
dxf = DifferentialStyle(font=red_text, fill=red_fill)
rule = Rule(type="duplicateValues", text="highlight", dxf=dxf)
wsRes.conditional_formatting.add('B1:B10000', rule)

Script keep showing "SettingCopyWithWarning'

Hello my problem is that my script keep showing below message
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
downcast=downcast
I Searched the google for a while regarding this, and it seems like my code is somehow
assigning sliced dataframe to new variable, which is problematic.
The problem is ** I can't find where my code get problematic **
I tried copy function, or seperated the nested functions, but it is not working
I attached my code below.
def case_sorting(file_get, col_get, methods_get, operator_get, value_get):
ops = {">": gt, "<": lt}
col_get = str(col_get)
value_get = int(value_get)
if methods_get is "|x|":
new_file = file_get[ops[operator_get](file_get[col_get], value_get)]
else:
new_file = file_get[ops[operator_get](file_get[col_get], np.percentile(file_get[col_get], value_get))]
return new_file
Basically what i was about to do was to make flask api that gets excel file as an input, and returns the csv file with some filtering. So I defined some functions first.
def get_brandlist(df_input, brand_input):
if brand_input == "default":
final_list = (pd.unique(df_input["브랜드"])).tolist()
else:
final_list = brand_input.split("/")
if '브랜드' in final_list:
final_list.remove('브랜드')
final_list = [x for x in final_list if str(x) != 'nan']
return final_list
Then I defined the main function
def select_bestitem(df_data, brand_name, col_name, methods, operator, value):
# // 2-1 // to remove unnecessary rows and columns with na values
df_data = df_data.dropna(axis=0 & 1, how='all')
df_data.fillna(method='pad', inplace=True)
# // 2-2 // iterate over all rows to find which row contains brand value
default_number = 0
for row in df_data.itertuples():
if '브랜드' in row:
df_data.columns = df_data.iloc[default_number, :]
break
else:
default_number = default_number + 1
# // 2-3 // create the list contains all the target brand names
brand_list = get_brandlist(df_input=df_data, brand_input=brand_name)
# // 2-4 // subset the target brand into another dataframe
df_data_refined = df_data[df_data.iloc[:, 1].isin(brand_list)]
# // 2-5 // split the dataframe based on the "brand name", and apply the input condition
df_per_brand = {}
df_per_brand_modified = {}
for brand_each in brand_list:
df_per_brand[brand_each] = df_data_refined[df_data_refined['브랜드'] == brand_each]
file = df_per_brand[brand_each].copy()
df_per_brand_modified[brand_each] = case_sorting(file_get=file, col_get=col_name, methods_get=methods,
operator_get=operator, value_get=value)
# // 2-6 // merge all the remaining dataframe
df_merged = pd.DataFrame()
for brand_each in brand_list:
df_merged = df_merged.append(df_per_brand_modified[brand_each], ignore_index=True)
final_df = df_merged.to_csv(index=False, sep=',', encoding='utf-8')
return final_df
And I am gonna import this function in my app.py later
I am quite new to all the coding, therefore really really sorry if my code is quite hard to understand, but I just really wanted to get rid of this annoying warning message. Thanks for help in advance :)

Pandas read (Excel) columns of texts, and return similarity ratio

Excel columns as below. I want to check the max similarity ratio of the content in column B, to those texts in column A.
Column A has several strings seperated by "; "
Column B has 1 string only
Here is what I come up with xlrd and xlwt.
import xlwt, xlrd
from difflib import SequenceMatcher
workbook = xlrd.open_workbook("C:\\file.xlsx")
old_sheet = workbook.sheet_by_index(0)
book = xlwt.Workbook(encoding='cp1252', style_compression = 0)
sheet = book.add_sheet('Sheet1', cell_overwrite_ok = True)
for row_index in range(0, old_sheet.nrows):
new_list = []
Cell_a = old_sheet.cell(row_index, 0).value
Cell_b = old_sheet.cell(row_index, 1).value
Cell_a_list = Cell_a.split("; ")
ratio_list = []
for each in Cell_a_list:
ratio = SequenceMatcher(None, each, Cell_b).ratio()
ratio_list.append(ratio)
Cell_c = max(ratio_list)
sheet.write(row_index, 0, Cell_a)
sheet.write(row_index, 1, Cell_b)
sheet.write(row_index, 2, Cell_c)
book.save("C:\\file-1.xls")
How does the Pandas way looked like, in addition to below? Thank you.
import pandas as pd
data = {'Column_a' : ["Spaghetti, BL; Pasta, without eggs, WKB; Pasta, without eggs, BL; Pasta, with eggs, WKB",
"Noodles, instant, portion pack, WKB; Vermicelli (Angel Hair), BL; Beef, fillet, tenderloin (H2)",
"Beef, center brisket (B2); Beef, center brisket, with bones (B2); Beef, Silverside (F2a); Beef, Sirloin steak (H1)",
"Beef, minced; Beef/pork, minced; Veal, breast (D1), with bones; Veal, schnitzel/escalope (A5)",
"Pork, fillet, tenderloin (B); Pork, schnitzel/escalope (AA)"],
'Column_b' : ["Fresh tortellini or ravioli, WKB",
"Beef, rumpsteak (H3)",
"Beef, shreds or dices (H3, F)",
"Veal, loin (B2)",
"Pork, schnitzel/escalope (A)"]}
df = pd.DataFrame(data)
In pandas you can directly read excel (docs: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html)
Suppose you read your excel to get a dataframe df with columns A and B. Then you can simply write:
def calc_ratio(a,b):
return max([SequenceMatcher(None, each, Cell_b).ratio() for each in a.split("; ")])
df["c"] = df.apply(calc_ratio, axis=1)
For writing the output back to excel use df.to_excel. For a detailed documentation see here https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html

Apply borders to all cells in a range with openpyxl

I have a script that takes a pandas dataframe and chops it up into several hundred chunks and saves each chunk as a separate excel file. Each chunk will have the same number of columns but the number of rows varies. I've figured out how to apply all the other necessary formatting to these files with openpyxl, but I haven't yet determined the fastest way to apply borders. Also, I think I'm just not applying borders correctly, because the code below (which I suspect shouldn't need to loop over each cell individually) doesn't apply any borders.
from openpyxl.style import Border
wb = load_workbook(filename = _fname)
ws = wb.worksheets[0]
for _row in ws.range('A1:L'+str(ws.get_highest_row() ) ):
for _cell in _row:
_cell.style.borders.left.border_style = Border.BORDER_THIN
_cell.style.borders.right.border_style = Border.BORDER_THIN
_cell.style.borders.top.border_style = Border.BORDER_THIN
_cell.style.borders.bottom.border_style = Border.BORDER_THIN
wb.save(_fname)
So this code works, but it doesn't apply the border I expect (the default border in excel) and it takes a lot more steps than I'd prefer. My expectation is that I should be able to do something like this:
from openpyxl.style import Border
wb = load_workbook(filename = _fname)
ws = wb.worksheets[0]
_range = ws.some_range_func('A1:L'+str(ws.get_highest_row() ) ):
_range.style.borders.all_borders = Borders.BORDER_THIN
Does this functionality exist? If not, can someone please be so kind as to at least explain how to apply the default border style and not this slightly thicker border? None of Border.BORDER_THICK, Border.BORDER_MEDIUM, Border.BORDER_THIN, or Border.BORDER_HAIR seem correct.
Thanks!
In more pythonic way for openpyxl==3.0.5:
from openpyxl.styles import Border, Side
def set_border(ws, cell_range):
thin = Side(border_style="thin", color="000000")
for row in ws[cell_range]:
for cell in row:
cell.border = Border(top=thin, left=thin, right=thin, bottom=thin)
set_border(worksheet, 'A5:C10')
May be this is handy:
from openpyxl.reader.excel import load_workbook
from openpyxl.style import Border
def set_border(ws, cell_range):
rows = was[cell_range]
for row in rows:
row[0].style.borders.left.border_style = Border.BORDER_THIN
row[-1].style.borders.right.border_style = Border.BORDER_THIN
for c in rows[0]:
c.style.borders.top.border_style = Border.BORDER_THIN
for c in rows[-1]:
c.style.borders.bottom.border_style = Border.BORDER_THIN
#usage example:
ws = load_workbook('example.xlsx').get_active_sheet()
set_broder(ws, "C3:H10")
It performs reasonably fast.
There is a slight modification to answer from #Karimov
Below is how your code should be
from openpyxl.styles import Border, Side, Font, Alignment
def __format_ws__(self, ws, cell_range):
border = Border(left=Side(border_style='thin', color='000000'),
right=Side(border_style='thin', color='000000'),
top=Side(border_style='thin', color='000000'),
bottom=Side(border_style='thin', color='000000'))
rows = ws[cell_range]
for row in rows:
for cell in row:
cell.border = border
A much faster way that uses list comprehension is below:
def __format_ws__(self, ws, cell_range):
#applying border and alignment
font = Font(size=9)
align=Alignment(horizontal='left', vertical='center')
border = Border(left=Side(border_style='thin', color='000000'),
right=Side(border_style='thin', color='000000'),
top=Side(border_style='thin', color='000000'),
bottom=Side(border_style='thin', color='000000'))
rows = [rows for rows in ws[cell_range]]
flattened = [item for sublist in rows for item in sublist]
[(setattr(cell,'border',border), setattr(cell,'font',font), setattr(cell,'alignment',align)) for cell in flattened]
The way you use it is:
self.__format_ws__(ws=writer.book.worksheets[0], cell_range='A1:G10')
Decision that works on openpyxl 2.3.5
from openpyxl.styles import Border, Side
def set_border(ws, cell_range):
border = Border(left=Side(border_style='thin', color='000000'),
right=Side(border_style='thin', color='000000'),
top=Side(border_style='thin', color='000000'),
bottom=Side(border_style='thin', color='000000'))
rows = ws.iter_rows(cell_range)
for row in rows:
for cell in row:
cell.border = border
set_border(worksheet, 'A5:C10')
#user698585 your approach seems nice but it doesn't work anymore as the present version of the openpyxl change the implementation. So this should be updated into e.g.
ws.cell(row=1, column=1).style.border.top.border_style = borders.BORDER_MEDIUM
but it results with an error that changing the style is not allowed.
As a workaround I just defined a dedicated styles, but they are just a duplication of the present styles plus border definition - not so good solution as works only if you know what style has the cell under the change.
border_style = Style(font=Font(name='Console', size=10, bold=False,
color=Color(openpyxl.styles.colors.BLACK)),
fill=PatternFill(patternType='solid', fgColor=Color(rgb='00C5D9F1')),
border=Border(bottom=Side(border_style='medium', color=Color(rgb='FF000000'))))
if you need styling (borders...) for pandas excel dataframe my fork just got merged into master
https://github.com/pydata/pandas/pull/2370#issuecomment-10898427
as for you borders problems.
setting all borders at once does not seam to work in openpyxl.
In [34]: c.style.borders.all_borders.border_style = openpyxl.style.Border.BORDER_THIN
In [36]: c.style
'Calibri':11:False:False:False:False:'none':False:'FF000000':'none':0:'FFFFFFFF':'FF000000':'none':'FF000000':'none':'FF000000':'none':'FF000000':'none':'FF000000':'none':'FF000000':0:'thin':'FF000000':'none':'FF000000':'none':'FF000000':'none':'FF000000':'none':'FF000000':'general':'bottom':0:False:False:0:'General':0:'inherit':'inherit'
setting individually works ('thin':'FF000000')
In [37]: c.style.borders.top.border_style = openpyxl.style.Border.BORDER_THIN
In [38]: c.style
Out[38]: 'Calibri':11:False:False:False:False:'none':False:'FF000000':'none':0:'FFFFFFFF':'FF000000':'none':'FF000000':'none':'FF000000':'thin':'FF000000':'none':'FF000000':'none':'FF000000':0:'thin':'FF000000':'none':'FF000000':'none':'FF000000':'none':'FF000000':'none':'FF000000':'general':'bottom':0:False:False:0:'General':0:'inherit':'inherit'
maybe a bug in openpyxl. but no big deal just wrap setting bottom , top, left, right in function
Had the same issue but couldn't find anything that fixes this problem for 2019 because of depreciation. I have something that works below.. could be better but works for all intends and purposes.
def set_border(ws, cell_range):
rows = ws[cell_range]
for row in rows:
if row == rows[0][0] or row == rows[0][-1] or row == rows[-1][0] or row == rows[-1][-1]:
pass
else:
row[0].border = Border(left=Side(style='thin'))
row[-1].border = Border(right=Side(style='thin'))
for c in rows[0]:
c.border = Border(top=Side(style='thin'))
for c in rows[-1]:
c.border = Border(bottom=Side(style='thin'))
rows[0][0].border = Border(left=Side(style='thin'), top=Side(style='thin'))
rows[0][-1].border = Border(right=Side(style='thin'), top=Side(style='thin'))
rows[-1][0].border = Border(left=Side(style='thin'), bottom=Side(style='thin'))
rows[-1][-1].border = Border(right=Side(style='thin'), bottom=Side(style='thin'))
def set_border(ws, cell_range, style='thin'):
rows = ws[cell_range]
for row in rows:
temp_row = copy(row[0].border)
row[0].border = Border(left=Side(style=style), right=temp_row.right, top=temp_row.top, bottom=temp_row.bottom)
temp_row = copy(row[-1].border)
row[-1].border = Border(right=Side(style=style), left=temp_row.left, top=temp_row.top, bottom=temp_row.bottom)
for c in rows[0]:
temp_row = copy(c.border)
c.border = Border(top=Side(style=style), left=temp_row.left, bottom=temp_row.bottom, right=temp_row.right)
for c in rows[-1]:
temp_row = copy(c.border)
c.border = Border(bottom=Side(style=style), left=temp_row.left, top=temp_row.top, right=temp_row.right)
This keeps the existing borders of the side and you can also style your border
seems there is no built-in for this task, and we have to make some steps ourselves, like:
#need make conversion from alphabet to number due to range function
def A2N(s,e):
return range(ord(s), ord(e)+1)
#B1 is the border you defined
#Assume you trying border A1-Q1 ... A3-Q3
X = A2N('A','Q')
#print X
your_desired_sheet_range_rows = range(1,4)
#need two loop to go through cells
for row in your_desired_sheet_rows:
for col in X:
ca = chr(col)
sheet[ca+str(row)].border=B1

Categories

Resources