I've written a script which copies data from one workbook to another. My only issue is that empty cells are being added between data. Can anyone understand why? It looks like the script is skipping values which don't meet the condition of the if statement, but still copying a blank cell.
# Copy values from testData.xlsx into testTemplate.xlsx: whenever a cell holds
# the label "A", "B" or "C", the value of the cell to its right is written into
# column 1, 2 or 3 of the template's first sheet.
from openpyxl import load_workbook
from openpyxl import Workbook
wb = load_workbook('testData.xlsx')
wb2 = load_workbook('testTemplate.xlsx')
ws = wb.worksheets[0]
mr = ws.max_row
ws2 = wb2.worksheets[0]
# Last used row of the template; new values are written below it.
mr2 = ws2.max_row
for row in ws.iter_rows(min_row = 1, min_col = 1, max_row = mr, max_col = 3):
    for cell in row:
        if cell.value == "A":
            ws2.cell(row = mr2 + 1, column = 1).value = (cell.offset(column = + 1).value)
            # NOTE: mr2 is the only row counter and is advanced after every
            # write, whichever column was written. Each value therefore lands
            # on its own row, which leaves blank cells in the other columns.
            mr2 += 1
        elif cell.value == "B":
            ws2.cell(row = mr2 + 1, column = 2).value = (cell.offset(column = + 1).value)
            mr2 += 1
        elif cell.value == "C":
            ws2.cell(row = mr2 + 1, column = 3).value = (cell.offset(column = + 1).value)
            mr2 += 1
wb2.save('testTemplate.xlsx')
Your issue:
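`mr2 += 1` runs every time a value is written, no matter which column it was written to. All three columns share that one row counter, so every new value is placed on a new row and the other columns are left blank on that row.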
Solution:
Create a separate row counter for each destination column, and increment a counter (`counter += 1`) only when a value is written to its column.
As per example:
# Row counter that belongs to column 1 only, seeded with the template's last used row.
A = ws2.max_row
if cell.value == "A":
    ws2.cell(row = A + 1, column = 1).value = (cell.offset(column = + 1).value)
    # Only this column's counter moves; the other columns keep their own position.
    A += 1
Full Code:
from openpyxl import load_workbook
from openpyxl import Workbook

# Source workbook (read) and template workbook (appended to, then saved).
wb = load_workbook('testData.xlsx')
wb2 = load_workbook('testTemplate.xlsx')
ws = wb.worksheets[0]
mr = ws.max_row
ws2 = wb2.worksheets[0]

# One independent "last written row" counter per destination column, all
# starting at the template's current last row.
A = B = C = ws2.max_row

for row in ws.iter_rows(min_row=2, min_col=1, max_row=mr, max_col=2):
    for cell in row:
        label = cell.value
        if label not in ("A", "B", "C"):
            continue
        # The value to copy sits in the cell immediately right of the label.
        payload = cell.offset(column=1).value
        if label == "A":
            A += 1
            ws2.cell(row=A, column=1).value = payload
        elif label == "B":
            B += 1
            ws2.cell(row=B, column=2).value = payload
        else:
            C += 1
            ws2.cell(row=C, column=3).value = payload

wb2.save('testTemplate.xlsx')
Related
I wrote a Python script, which inserts data into an Excel file. My problem is that my drop-down list disappears after every save. I do have multiple sheets in my Excel file and not only my current page, which I edit is affected, also all others too.
Here is my code:
def openExcelFile(path: str = None) -> "openpyxl.Workbook":
    """Load the workbook stored at ``path`` with openpyxl.

    Args:
        path: Location of the Excel file to open.

    Returns:
        The workbook object returned by ``openpyxl.load_workbook``.

    Raises:
        ValueError: If ``path`` is None.
    """
    if path is None:
        raise ValueError("path is None!")
    return openpyxl.load_workbook(path)
def saveToExcelFile(wb_object: "openpyxl.Workbook" = None, path: str = None) -> None:
    """Save ``wb_object`` to ``path`` and close it.

    Args:
        wb_object: Workbook to write; must provide ``save(path)`` and ``close()``.
        path: Destination file path.

    Raises:
        ValueError: If ``wb_object`` or ``path`` is None.
    """
    if wb_object is None or path is None:
        raise ValueError("wb_object or path is None!")
    try:
        wb_object.save(path)
    finally:
        # Close even when saving fails so the workbook is always released.
        wb_object.close()
def findDuplicate(sheet_object, dataset: "LandRegister" = None, column: int = 0, row: int = 0, index: int = 0):
    """Scan upwards from ``row`` for a sheet entry matching dataset record ``index``.

    Rows ``row``, ``row - 1``, ... ``row - index`` are inspected. A row matches
    when its last name, address and zip code (columns ``column + 1`` to
    ``column + 3``) equal those of the dataset record.

    Args:
        sheet_object: Worksheet-like object exposing ``cell(row=, column=)``.
        dataset: Object with parallel ``firstname``, ``lastname``, ``address``
            and ``zipcode`` sequences.
        column: Sheet column that holds the first name.
        row: Sheet row at which the search starts.
        index: Position of the record inside ``dataset``.

    Returns:
        ``(-1, -1, -1)`` when no row matches, ``(-2, -2, -2)`` when the first
        matching row also has the same first name (exact duplicate), otherwise
        ``(row - 1, column, index)``.
    """
    if index == 0 and row == 1:
        return (-1, -1, -1)

    for i in range(index + 1):
        current = row - i
        cell_firstname = sheet_object.cell(row=current, column=column).value
        cell_lastname = sheet_object.cell(row=current, column=column + 1).value
        cell_address = sheet_object.cell(row=current, column=column + 2).value
        cell_zipcode = sheet_object.cell(row=current, column=column + 3).value
        if (cell_lastname == dataset.lastname[index]
                and cell_address == dataset.address[index]
                and cell_zipcode == dataset.zipcode[index]):
            if cell_firstname == dataset.firstname[index]:
                return (-2, -2, -2)
            # NOTE(review): the match was found on row ``row - i`` but
            # ``row - 1`` is reported regardless of ``i`` -- confirm intended.
            return (row - 1, column, index)

    # The loop returns on the first match, so reaching this point means no
    # row matched.
    return (-1, -1, -1)
def insertNewRow(sheet, row, nr):
    """Insert one row at ``row`` and store ``nr`` in column 2 of that row."""
    sheet.insert_rows(row, amount=1)
    sheet.cell(row=row, column=2).value = nr
def insertData(sheet_object, dataset: LandRegister = None, column: int = 0, row: int = 0, index: int = 0):
    """Write record ``index`` of ``dataset`` into four adjacent cells of ``row``.

    Starting at ``column`` the cells receive, in order: first name, last name,
    address and zip code.
    """
    fields = ("firstname", "lastname", "address", "zipcode")
    for offset, field in enumerate(fields):
        target = sheet_object.cell(row=row, column=column + offset)
        target.value = getattr(dataset, field)[index]
def insertMetaData(sheet_object, row: int = 0, column: int = 0):
    """Copy three adjacent cell values from ``row`` into the row below.

    The cells at ``column``, ``column + 1`` and ``column + 2`` of ``row`` are
    duplicated into the same columns of ``row + 1``.
    """
    # Fetch all source cells first, then write, mirroring the read-then-write order.
    sources = [sheet_object.cell(row=row, column=column + offset) for offset in range(3)]
    for offset, source in enumerate(sources):
        target = sheet_object.cell(row=row + 1, column=column + offset)
        target.value = source.value
def writeToExcelFile(wb_object, dataset: LandRegister = None, sheet_name: str = None, cell: str = 'A0'):
    """Write every record of ``dataset`` into the sheet ``sheet_name``.

    The start position is read from ``SETTINGS[sheet_name]``. For each record
    ``findDuplicate`` decides what happens: a new entry is inserted, an exact
    duplicate is skipped, or the first name is appended (joined with " und ")
    to the cell reported by ``findDuplicate``. Afterwards
    ``SETTINGS[sheet_name]['row']`` is advanced by the number of inserted rows.

    The ``cell`` parameter is not used by this function.

    Raises:
        ValueError: If an argument is None or the sheet does not exist.
    """
    if wb_object is None or dataset is None or sheet_name is None:
        raise ValueError("dataset, sheet_name or wb_object is None!")

    start_row = SETTINGS[sheet_name]['row']
    start_col = SETTINGS[sheet_name]['col']
    if sheet_name not in wb_object.sheetnames:
        raise ValueError("No matching sheet name found!")
    sheet_object = wb_object[sheet_name]

    not_found = (-1, -1, -1)
    exact_duplicate = (-2, -2, -2)

    row = 0  # number of rows inserted so far
    for index in range(len(dataset.firstname)):
        current_row = row + start_row
        found_row, found_colum, found_index = findDuplicate(
            sheet_object, dataset, column=start_col, row=current_row, index=index)
        outcome = (found_row, found_colum, found_index)

        if outcome == exact_duplicate:
            continue

        if outcome == not_found:
            if index > 0:
                # Later records get a fresh row plus a copy of the metadata
                # cells from the row above.
                number_cell = sheet_object.cell(
                    row=current_row - 1, column=start_col - 10)
                insertNewRow(sheet_object, current_row, number_cell.value)
                insertMetaData(sheet_object, current_row - 1, start_col - 9)
            insertData(sheet_object, dataset,
                       column=start_col, row=current_row, index=index)
            row += 1
        else:
            # Same family: extend the existing first-name cell.
            name_cell = sheet_object.cell(row=found_row, column=found_colum)
            name_cell.value += " und " + dataset.firstname[index]

    SETTINGS[sheet_name]['row'] = start_row + row
def loadSettings():
    """Read ``settings.json`` from ``DIRPATH`` into the global ``SETTINGS``."""
    global SETTINGS
    settings_path = os.path.join(DIRPATH, 'settings.json')
    with open(settings_path, 'r') as handle:
        SETTINGS = json.load(handle)
def saveSettings():
    """Write the global ``SETTINGS`` to ``settings.json`` inside ``DIRPATH``."""
    settings_path = os.path.join(DIRPATH, 'settings.json')
    with open(settings_path, 'w') as handle:
        json.dump(SETTINGS, handle)
def run(files: List[str], landregiser: LandRegister = None, extractor: ExtractorLandRegister = None) -> None:
    """Extract every PDF in ``files`` and write the results into the Excel file.

    For each key of ``files`` the text of ``files[key]`` is extracted, parsed
    by ``landregiser`` and written into the configured sheet. A failure for
    one file is printed and logged, then processing continues with the next.
    The workbook is saved once at the end.
    """
    # NOTE(review): ``files`` is indexed with the keys it yields, which suggests
    # a mapping rather than the annotated ``List[str]`` -- confirm with caller.
    workbook_path = os.path.join(DIRPATH, EXCEL_FILE_NAME)
    workbook = openExcelFile(workbook_path)
    sheet_name = SETTINGS["Sheets"][SETTINGS["Source"]['4']]
    main_logger.info(f'Insert in {sheet_name}')

    counter = 1
    for key in files:
        print_message = f'==={files[key]:<2}===\n'
        try:
            landregiser.dataset = extractor.extractTextFromPdf(files[key])
            landregiser.execute()
            writeToExcelFile(workbook, landregiser, sheet_name)
            main_logger.info(
                f'Counter {counter:<2}: | {files[key]:<75} | {len(landregiser.firstname):<2}')
        except Exception as exce:
            print(exce)
            print_message += str(exce)
            main_logger.error("Error")
        # The counter advances for successful and failed files alike.
        counter += 1
        # Move the start row down by one after each file.
        SETTINGS[sheet_name]['row'] += 1

    saveToExcelFile(workbook, workbook_path)
    # saveSettings()
def main():
    """Load the settings, build the extractor and register, and process all PDFs."""
    loadSettings()
    extractor = ExtractorLandRegister(SETTINGS["Source"]['4'])
    register = LandRegister()
    run(extractor.getPdfFiles(), landregiser=register, extractor=extractor)


if __name__ == '__main__':
    main()
Do I use openpyxl in a wrong way or does it not support to load an existing drop-down list?
OK, openpyxl does not keep the existing data validation (the drop-down lists) when it loads the file, but it is possible to insert the data first and then re-create the validation and formatting afterwards.
To make a validation with openpyxl use this link to do so.
I'm trying to write a script that can update a column based on a transaction ID. I'm using Python 3 and openpyxl to read the Excel file.
In the above image, it would be to update the highlighted cells with the same value in column K, as they have the same transaction ID in column C. Then when it gets to C12, it updates column K with a different value as the value of C has changed...and so on and so on.
So far I have:
# Attempt to fill column K (11) for rows that share a transaction ID in column C (3).
from openpyxl import load_workbook, Workbook
import re
wb = load_workbook(filename = 'Testing.xlsx')
ws = wb['Test']
# Visit every row from row 2 to the last used row.
for r in range(2, ws.max_row + 1):
    column_c = ws.cell(row = r, column = 3).value
    column_h = ws.cell(row = r, column = 8).value
    column_i = ws.cell(row = r, column = 9).value
    column_j = ws.cell(row = r, column = 10).value
    # NOTE(review): previous is reset to None for every row, so it never
    # carries the previous row's transaction ID.
    previous = None
    # NOTE(review): nothing in the loop body changes the condition, and
    # column_j_formatted is not defined anywhere in this snippet.
    while (previous == column_c):
        ws.cell(row = r, column = 11).value = column_j_formatted
    if (previous != column_c):
        continue
wb.save('Testing_processed.xlsx')
UPDATE
I have tried to replace the while loop with:
# Second attempt: compare each row's column C with the previous row's.
previous_col_c = ws.cell(row=r-1, column=3)
# NOTE(review): the loop variable is row_num, but every lookup below uses r.
for row_num in range (2, ws.max_row + 1):
    current_col_c = ws.cell(row=r, column=3)
    current_col_j = ws.cell(row=r, column=11)
    # NOTE(review): this compares cell objects, not their .value attributes.
    if current_col_c == previous_col_c:
        ws.cell(row = r, column = 11).value = column_j_formatted
    previous_col_c = current_col_c
Just to illustrate how the openpyxl API makes this kind of task very easy.
# Fill blank cells in column K with the column-K value found on the first row
# of the same transaction. A transaction is a run of consecutive rows that
# share the same ID in column C.
txn = None
filler = None
# max_col=11 guarantees every row tuple reaches column K.
for row in ws.iter_rows(min_row=2, max_col=11):
    txn_cell = row[2]   # column C: transaction ID
    k = row[10]         # column K: value to propagate
    if txn_cell.value != txn:
        # A new transaction starts here: remember its ID and its K value.
        txn = txn_cell.value
        filler = k.value
    if not k.value:
        k.value = filler
But really the work should be done in the source of the data, presumably a database.
I am trying to copy the values from some cells but it give me this error, i tried even without using the def cell(x,y) but still the same error.
This is the error:
learn_tar.cell(row=learn_tar, column=1).value = sheet.cell(row=learn_tar, column=1).value
AttributeError: 'int' object has no attribute 'cell'
Source:
# Count the rows of each class in Sheet1, then split the rows into four new
# sheets (learn/test, attributes/targets).
import openpyxl
def cell(x,y):
    # Return the value stored at row x, column y of the global sheet.
    cell = sheet.cell(row=x,column=y).value
    return cell;
def percentage(percent, whole):
    # Return percent % of whole, truncated to an int.
    return int((percent * whole) / 100.0);
ex = openpyxl.load_workbook("Final_excel2.xlsx")
sheet = ex.get_sheet_by_name('Sheet1')
# Per-class row counts and 70 % of each count.
num = [0,0,0]
per = [0,0,0]
for row in range(2,4798):
    if cell(row,1) == '1: Progression':
        num[0] = num[0] + 1
    elif cell(row,1) == '2: Incidence':
        num[1] = num[1] + 1
    elif cell(row,1) == '3: Non-exposed control group':
        num[2] = num[2] + 1
    for column in range(2,49):
        # doing stuff (the loop body was left out of the original post)
per[0] = percentage(70,num[0])
per[1] = percentage(70,num[1])
per[2] = percentage(70,num[2])
learn_att = ex.create_sheet('Learn-Att',2)
learn_tar = ex.create_sheet('Learn-Tar',3)
test_att = ex.create_sheet('Test-Att',4)
test_tar = ex.create_sheet('Test-Tar',5)
# NOTE: these four assignments rebind the names that held the new worksheets
# to the integer 1, so the .cell(...) calls below are made on an int. This is
# the cause of "AttributeError: 'int' object has no attribute 'cell'".
learn_att = 1
learn_tar = 1
test_att = 1
test_tar = 1
for row in range(2,4798):
    if row<=1391:
        if row<=974:
            learn_tar.cell(row=learn_tar, column=1).value = cell(row,1)
            learn_att+= 1
            learn_tar+= 1
        else:
            test_tar.cell(row = test_tar,column = 1).value = cell(row,1)
            test_att+= 1
            test_tar+= 1
    for column in range(2,49):
        if row<=1391:
            if row<=974:
                learn_att.cell(row = learn_att,column = column - 1).value = cell(row,column)
            else:
                test_att.cell(row = test_att,column = column - 1).value = cell(row,column)
You override learn_tar with 1:
learn_tar = ex.create_sheet('Learn-Tar',3)
...
learn_tar = 1
Remove:
learn_tar = 1
and:
learn_tar+= 1
from your code.
Lines 2:13 show the existing data format; I want to convert it to the format displayed in lines 16:19.
The Code I have written is
# Scan Sheet1 for "Entity Name:", "Counterparty Name:" and "TradeRef" cells
# and build one DataFrame per trade table found.
import xlrd
import pandas as pd
book = xlrd.open_workbook(
    "C:/Users/Vinod/Desktop/DataSet Mining/python-pandas.xlsx")
sheet = book.sheet_by_name('Sheet1')
df1 = pd.DataFrame()
n = sheet.nrows
print(n)
n2 = sheet.ncols
print(n2)
for row_index in range(0, n):
    for col_index in range(0, n2):
        # NOTE(review): row_index can have been advanced to n by the while
        # loop below during an earlier column, and there is no bounds check
        # before this read.
        col = sheet.cell(row_index, col_index).value
        df = pd.DataFrame()
        l1 = []
        l2 = []
        l3 = []
        l4 = []
        l5 = []
        l6 = []
        # The value right of a label cell is the label's content.
        if (col == "Entity Name:"):
            EntityName = sheet.cell(row_index, col_index + 1).value
        if (col == "Counterparty Name:"):
            CounterpartyName = sheet.cell(row_index, col_index + 1).value
        if (col == "TradeRef"):
            # Table header found: read the rows below until a blank cell or
            # the end of the sheet.
            row_index = row_index + 1
            value_1 = sheet.cell(row_index, col_index).value
            while (value_1 != ""):
                TradeRef = sheet.cell(row_index, col_index).value
                TradeDate = sheet.cell(row_index, col_index + 1).value
                TradeType = sheet.cell(row_index, col_index + 2).value
                ConfirmationMedium = sheet.cell(row_index, col_index + 3).value
                l1.append(TradeRef)
                l2.append(TradeDate)
                l3.append(TradeType)
                l4.append(ConfirmationMedium)
                row_index = row_index + 1
                if (row_index >= n):
                    value_1 = ""
                    col_index = 4
                else:
                    value_1 = sheet.cell(row_index, col_index).value
                print(value_1)
            df['TradeRef'] = l1
            df['TradeDate'] = l2
            df['TradeType'] = l3
            df['ConfirmationMedium'] = l4
            df['EntityName'] = EntityName
            df['CoutrepartyName'] = CounterpartyName
            print(df)
            print(row_index)
            print(col_index)
            # NOTE(review): the result of append() is not assigned back to df1.
            df1.append(df)
print(df1)
I got it resolved #Azat
# Scan Sheet1 for "Entity Name:", "Counterparty Name:" and "TradeRef" cells
# and collect every trade table into one DataFrame, tagging each trade row
# with the entity and counterparty names seen so far.
import xlrd
import pandas as pd

book = xlrd.open_workbook("C:/Users/Vinod/Desktop/DataSet Mining/python-pandas.xlsx")
sheet = book.sheet_by_name('Sheet1')
df1 = pd.DataFrame()
n = sheet.nrows
n2 = sheet.ncols
for row_index in range(0, n):
    for col_index in range(0, n2):
        # row_index may have been advanced past the last row while reading a
        # trade table in an earlier column of this row.
        if row_index < n:
            col = sheet.cell(row_index, col_index).value
            df = pd.DataFrame()
            l1 = []
            l2 = []
            l3 = []
            l4 = []
            # The value right of a label cell is the label's content.
            if col == "Entity Name:":
                EntityName = sheet.cell(row_index, col_index + 1).value
            if col == "Counterparty Name:":
                CounterpartyName = sheet.cell(row_index, col_index + 1).value
            if col == "TradeRef":
                # Table header found: read the rows below until a blank cell
                # or the end of the sheet.
                row_index = row_index + 1
                value_1 = sheet.cell(row_index, col_index).value
                while value_1 != "":
                    TradeRef = sheet.cell(row_index, col_index).value
                    TradeDate = sheet.cell(row_index, col_index + 1).value
                    TradeType = sheet.cell(row_index, col_index + 2).value
                    ConfirmationMedium = sheet.cell(row_index, col_index + 3).value
                    l1.append(TradeRef)
                    l2.append(TradeDate)
                    l3.append(TradeType)
                    l4.append(ConfirmationMedium)
                    row_index = row_index + 1
                    if row_index >= n:
                        value_1 = ""
                        col_index = 4
                    else:
                        value_1 = sheet.cell(row_index, col_index).value
                df['TradeRef'] = l1
                df['TradeDate'] = l2
                df['TradeType'] = l3
                df['ConfirmationMedium'] = l4
                df['EntityName'] = EntityName
                df['CoutrepartyName'] = CounterpartyName
                # DataFrame.append was removed in pandas 2.0; concat is the
                # equivalent that works on old and new versions.
                df1 = pd.concat([df1, df])
print(df1)
Is there any way to split/unmerge cells in excel workbook using python? What I want is explained below -
The result should a new excel file with following entries -
My solution using xlrd to copy the same string for all merged column is as given below -
[Note: "formatted_info = True" flag is not yet implemented in xlrd which I am using hence I cannot directly get the list of merged cells.. I am not supposed to upgrade xlrd on the setup.]
def xlsx_to_dict():
    """Convert the workbook ``xlsfile`` into a nested city -> name -> phones dict.

    Column 0 holds city names and column 1 holds person names; blank cells in
    these columns (as produced by merged cells) are skipped. Every value from
    column 2 onwards is collected under the ``'Phone'`` key of the current
    city/name pair.

    Returns:
        ``{city: {name: {'Phone': [values, ...]}}}`` for the last worksheet
        processed, because the dictionary is rebuilt for every worksheet. An
        empty dict is returned if the workbook has no worksheets.
    """
    workbook = xlrd.open_workbook(xlsfile)
    result_dict = {}
    for worksheet_name in workbook.sheet_names():
        worksheet = workbook.sheet_by_name(worksheet_name)
        # NOTE(review): with num_rows = nrows - 1, range(1, num_rows) below
        # never visits the last row of the sheet -- confirm this is intended.
        num_rows = worksheet.nrows - 1
        num_cells = worksheet.ncols - 1

        # All non-blank values of the city and name columns.
        cities = [value
                  for value in (worksheet.cell_value(r, 0) for r in range(1, num_rows))
                  if value != ""]
        names = [value
                 for value in (worksheet.cell_value(r, 1) for r in range(1, num_rows))
                 if value != ""]

        current_city = cities[0]
        result_dict = {}
        for curr_row in range(1, num_rows):
            # NOTE(review): the name is reset to the first name on every row,
            # so a blank name cell does not inherit the name from the row
            # above -- confirm this is intended.
            curr_name = names[0]
            for curr_cell in range(num_cells + 1):
                cell_value = worksheet.cell_value(curr_row, curr_cell)
                if curr_cell == 0:
                    # City column: blank means "same city as before".
                    if cell_value != "":
                        current_city = cell_value
                        result_dict.setdefault(current_city, {})
                    continue
                if curr_cell == 1:
                    # Name column: blank means "no new name on this row".
                    if cell_value != "":
                        curr_name = cell_value
                        result_dict.setdefault(current_city, {}).setdefault(curr_name, {})
                    continue
                # Remaining columns: accumulate instead of overwriting.
                entry = result_dict.setdefault(current_city, {}).setdefault(curr_name, {})
                entry.setdefault('Phone', []).append(cell_value)
    return result_dict
The above function will return python dictionary as below -
{ 'New York' : { 'Tom' : [92929292, 33929] }, ........}
I will then traverse the directory and write new excel.
However, I want some generic way of splitting merged cells.
This function gets the "real" cell value, i.e., the value of the merged cell if the coordinates are anywhere inside the merged cell.
def unmergedValue(rowx, colx, thesheet):
    """Return the effective value of cell ``(rowx, colx)`` in ``thesheet``.

    If the coordinates lie inside one of ``thesheet.merged_cells``, the value
    of that range's top-left cell is returned; otherwise the cell's own value.
    Each merged range is a ``(rlo, rhi, clo, chi)`` tuple whose upper bounds
    are exclusive.
    """
    for rlo, rhi, clo, chi in thesheet.merged_cells:
        # Plain comparisons work on Python 2 and 3 (xrange is Python 2 only).
        if rlo <= rowx < rhi and clo <= colx < chi:
            return thesheet.cell_value(rlo, clo)
    # Not inside any merged range.
    return thesheet.cell_value(rowx, colx)
Loosely based on http://www.lexicon.net/sjmachin/xlrd.html#xlrd.Sheet.merged_cells-attribute
Very inefficient, but should be acceptable for small-ish spreadsheets.
if your file has no empty cells in the middle, this may help, read the file, do some job, rewrite it.
def read_merged_xls(file_contents):
    """Read the first sheet of an xls file, filling blank cells from above.

    A blank cell takes the value already stored for the same column in the
    previous row, which reproduces the value of vertically merged cells.

    Args:
        file_contents: Raw bytes of the workbook, passed to ``xlrd.open_workbook``.

    Returns:
        A list of rows, each a list of cell values.
    """
    book = xlrd.open_workbook(file_contents=file_contents)
    sheet = book.sheet_by_index(0)
    data = []
    for rx in range(sheet.nrows):
        line = []
        for ry in range(sheet.ncols):
            cell = sheet.cell_value(rx, ry)
            # Only an empty string counts as blank; a numeric 0 is real data
            # and must not be replaced by the value above it.
            if cell == '':
                cell = data[-1][ry] if data else ''
            line.append(cell)
        data.append(line)
    return data
import xlrd
import xlsxwriter
import numpy as np
import pandas as pd
def rep(l, i):
    """Return the nearest non-empty value that precedes index ``i`` in ``l``.

    The search runs backwards from ``i - 1`` to the start of the list and
    never wraps around to the end.

    Returns:
        The first value found that is not ``u''``, or None when every earlier
        entry is empty (or ``i`` is 0).
    """
    for j in range(i - 1, -1, -1):
        if l[j] != u'':
            return l[j]
    return None
def write_df2xlsx(df, filename):
    """Write ``df`` to ``filename`` as sheet 'Sheet1', without the index column.

    The XlsxWriter engine is used. The writer is used as a context manager so
    the file is written and closed on exit; ``ExcelWriter.save()`` no longer
    exists in pandas 2.0.
    """
    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1', index=False)
def csv_from_excel(filename):
    """Fill the blanks left by merged cells and write the result to 'ResultFile.xlsx'.

    For every worksheet of ``filename``: the first row in which no cell is
    empty is taken as the header (row 0 if there is none), the rows below it
    become a DataFrame, and each empty string is replaced by ``rep`` with the
    nearest non-empty value above it in the same column.

    The output file name is fixed, so with several worksheets each one
    overwrites the previous result.
    """
    wb = xlrd.open_workbook(filename)
    empty = xlrd.empty_cell.value
    for worksheet_name in wb.sheet_names():
        sh = wb.sheet_by_name(worksheet_name)
        if sh.nrows == 0:
            # An empty sheet has no header row to read.
            continue

        # First fully populated row is the header; fall back to row 0.
        header_index = next(
            (i for i in range(sh.nrows)
             if all(c.value != empty for c in sh.row(i))),
            0)
        columns = sh.row_values(header_index)
        rows = [sh.row_values(rownum)
                for rownum in range(header_index + 1, sh.nrows)]
        data = pd.DataFrame(rows, columns=columns)

        for col in data.columns:
            # Snapshot the column once; replacements look at original values.
            values = list(data[col])
            if u'' not in values:
                continue
            filled = [rep(values, idx) if value == u'' else value
                      for idx, value in enumerate(values)]
            data[col] = pd.Series(filled)

        write_df2xlsx(data, 'ResultFile.xlsx')