I have a long list of names in column A and their x, y coordinates in columns B and C. I want the script to:
find partial string matches among the names in column A, and also pick the ones sharing the same y coordinate;
find the max and min values among the x coordinates;
append 'start' to the cell name with the max x value, and append 'end' to the cell name with the min x value.
Sheet
`
import xlwings as xw
wb = xw.books.active
def rename():
    """Scan 'new Sheet' for names containing 'DAD', group their X coords by
    Y coord, then rewrite the names so the largest X in each group becomes
    the START cell and the smallest X the END cell.

    NOTE(review): this is the original question code with the NameError in
    the max-row lookup fixed (`name` was never defined); the grouping logic
    is otherwise preserved as posted.
    """
    sht = wb.sheets['new Sheet']
    dict1 = {}      # Y coord -> list of X coords of matching rows
    ccount = 0      # consecutive rows sharing the same Y coord
    # Last used row in column A.  BUG FIX: was `wb.sheets[name]`, which
    # raised NameError because `name` is undefined; use the same sheet.
    mr = sht.range('A' + str(sht.cells.last_cell.row)).end('up').row
    for cell in sht.range(f'A2:A{mr}'):
        if 'DAD' in cell.value:
            # This range is a single cell, so the loop runs exactly once;
            # kept as posted -- presumably a wider range was intended.
            for yv in sht.range(f'C{cell.row}'):
                if yv.value == yv.offset(1, 0).value:
                    ccount = ccount + 1
                if ccount >= 4:
                    y_coord = int(sht.range(f'C{yv.row}').value)
                    x_coord = int(sht.range(f'B{yv.row}').value)
                    if y_coord in dict1:
                        dict1[y_coord] += [x_coord]
                    else:
                        dict1[y_coord] = [x_coord]
    count = 0
    start = 2
    for key, value in dict1.items():
        value.sort(reverse=True)   # descending: first is max X, last is min X
        for enum, i in enumerate(value, start):
            if enum == start:
                count = 2
                first_name = [f'A{enum}+_START', i, key]
                sht.range(f'A{enum}').value = first_name
            elif enum == len(value) + start - 1:
                last_name = [f'A{enum}+_END', i, key]
                sht.range(f'A{enum}').value = last_name
            else:
                between_names = [f'DAD_{count}', i, key]
                sht.range(f'A{enum}').value = between_names
                if enum % 2 == 0:
                    count += 1
Your requirement and the data you are working with is clearer so now I can give a better example.
The code will create a dictionary using the Y coord as the key and the x cords as values.
Using your updated information, the dictionary 'dict1' would have three keys: 67475, 57475 & 47475. Every X coordinate whose name includes the text 'DAD' is added as a value under the corresponding Y-coordinate key.
Then simply iterate through each key, sort the values in reverse so we know the first item is MAX value and the last is MIN value and then print each Y coord group with generated name and X coord separately.
def rename(fn):
    """Group the X coords of every 'DAD' row in 'new Sheet' by Y coord and
    write renamed rows to the 'Output' sheet of the same workbook.

    Within each Y-coord group the X values are sorted descending; the max-X
    row is named 'DAD_START', the min-X row 'DAD_END', and the rows in
    between 'DAD_<n>', with <n> stepping up every second row.

    Args:
        fn: path of the workbook to open and save back to.
    """
    out_sheet = 'Output'   # Name of sheet to write output to
    dict1 = {}             # Y coord -> list of X coords (unsorted data)
    header_list = []       # Header texts captured from row 1
    # Context manager closes the Excel instance even on error.
    with xw.App() as app:
        # Open workbook and input sheet.
        wb = xw.Book(fn)
        sht = wb.sheets['new Sheet']
        # Last used row in column A.
        mr = sht.range('A' + str(wb.sheets[0].cells.last_cell.row)).end('up').row
        ### Extract the data ###
        for cell in sht.range(f'A1:A{mr}'):
            # Row 1 holds the headers; capture them and skip.
            if cell.row == 1:
                for header_cell in sht.range('A1:C1'):
                    header_list.append(header_cell.value)
                continue
            # Rows whose name contains 'DAD' contribute their X coord to
            # the dict entry keyed by their Y coord.
            if 'DAD' in cell.value.upper():
                x_coord = int(sht.range(f'B{cell.row}').value)
                y_coord = int(sht.range(f'C{cell.row}').value)
                if y_coord in dict1:
                    dict1[y_coord] += [x_coord]
                else:
                    dict1[y_coord] = [x_coord]
        ### Output the data ###
        # Create the 'Output' sheet (with bold headers) if it doesn't exist.
        if out_sheet not in wb.sheet_names:
            sht2 = wb.sheets.add(out_sheet, after=wb.sheets[0])
            sht2['A1'].value = header_list
            sht2.range('A1:C1').font.bold = True
        else:
            sht2 = wb.sheets['Output']
        # Print and write one renamed row per X coord, group by group.
        count = 0
        start = 2   # first output row (row 1 is the header)
        for key, value in dict1.items():
            value.sort(reverse=True)   # descending: first is max, last min
            for enum, i in enumerate(value, start):
                if enum == start:
                    # First x coord is the max value.
                    count = 2
                    tstr_start = ['DAD_START', i, key]
                    print(tstr_start, end=' ')
                    sht2.range(f'A{enum}').value = tstr_start
                elif enum == len(value) + start - 1:
                    # Last x coord is the min value.
                    tstr_end = ['DAD_END', i, key]
                    print(tstr_end)
                    sht2.range(f'A{enum}').value = tstr_end
                else:
                    # In-between cells: `count` increments every 2nd loop.
                    tstr_mid = [f'DAD_{count}', i, key]
                    print(tstr_mid, end=' ')
                    sht2.range(f'A{enum}').value = tstr_mid
                    if enum % 2 == 0:
                        count += 1
            print('')
            start = enum + 2
            print('\n--------------\n')
        # BUG FIX: was wb.save(filename) -- `filename` is a global that only
        # exists when run as a script; save to the path that was passed in.
        wb.save(fn)
if __name__ == '__main__':
    # Workbook to process; rename() writes its results to the 'Output' sheet.
    filename = 'foo.xlsx'
    rename(filename)
Related
enter image description here
import openpyxl
def sum_gr_values(file_name):
    """Sum column-E values of rows whose column F is "GR" within the block
    of consecutive yellow-filled rows, and write the total next to the
    first yellow cell.

    Args:
        file_name: path of the .xlsx workbook; it is modified in place.
    """
    # Load the workbook and its active sheet.
    workbook = openpyxl.load_workbook(file_name)
    sheet = workbook.active
    # Find the first yellow-filled cell.  openpyxl reports fill colours as
    # 8-digit ARGB strings (e.g. 'FFFFFF00'), never with a leading '#',
    # so compare only the last 6 (RGB) characters.
    yellow_cell = None
    for row in sheet.iter_rows():
        for cell in row:
            if str(cell.fill.start_color.rgb)[-6:] == 'FFFF00':
                yellow_cell = cell
                break
        if yellow_cell:
            break
    if yellow_cell is None:
        # Nothing highlighted: leave the workbook untouched.
        return
    # Row NUMBERS, not Cell objects (the original assigned the Cell itself,
    # which broke the `start_row + 1` arithmetic below).
    start_row = yellow_cell.row
    end_row = start_row
    # Extend downward while the rows stay yellow.
    for row in range(start_row + 1, sheet.max_row + 1):
        if str(sheet.cell(row=row, column=yellow_cell.column).fill.start_color.rgb)[-6:] == 'FFFF00':
            end_row = row
        else:
            break
    # Sum the values in column E where column F is "GR".
    total_sum = 0
    for row in range(start_row, end_row + 1):
        if sheet.cell(row=row, column=6).value == "GR":
            total_sum += sheet.cell(row=row, column=5).value
    # Write the sum one column to the left of the first yellow cell
    # (kept as posted -- confirm the target column against the sheet layout).
    yellow_cell.offset(0, -1).value = total_sum
    # Save the changes to the file.
    workbook.save(file_name)
# Process the workbook in place.
file_name = "file.xlsx"
sum_gr_values(file_name)
In the "# Find the first yellow-filled cell" section the yellow cell is not detected. I've also added the Excel screenshot. I tried to debug but could not understand the problem. I am new to coding, so it might be an easy one.
It needs to detect the yellow rows.
You can use openpyxl Python library to parse an Excel document, iterate over its cells, and check the cell value, style, color, etc.
https://stackoverflow.com/questions/69410376/how-to-detect-color-of-excel-xlsx-row[enter link description here]1
Check the last 6 characters
if cell.fill.start_color.rgb[-6:] == 'FFFF00':
import openpyxl
from openpyxl.styles import PatternFill
def sum_gr_values(file_name):
    """For each yellow-highlighted row, total the column-E values of the
    "GR" rows beneath it (down to the next yellow row) and write the total
    into column E of that yellow row, keeping the yellow fill.

    Args:
        file_name: path of the .xlsx workbook; it is modified in place.
    """
    workbook = openpyxl.load_workbook(file_name)
    sheet = workbook.active
    # Collect the first cell of every yellow-filled row; openpyxl returns
    # ARGB strings, so only the trailing 6 (RGB) characters are compared.
    yellow_cells = []
    first_cell = None
    for current_row in sheet.iter_rows():
        first_cell = current_row[0]
        if first_cell.fill.start_color.rgb[-6:] == 'FFFF00':
            yellow_cells.append(first_cell)
    # Sentinel one past the final data row so the last group is summed too.
    yellow_cells.append(first_cell.offset(1))
    # Show what was found.
    for n, marker_cell in enumerate(yellow_cells):
        print("n: {} rowY: {}".format(n, marker_cell.row))
    print()
    # At least one yellow row plus the sentinel is required.
    if len(yellow_cells) < 2:
        print("ERROR - No yellow rows found")
        exit()
    # Total each span between consecutive yellow rows.
    for n in range(len(yellow_cells) - 1):
        group_total = 0
        top_row = yellow_cells[n].row        # start row
        stop_row = yellow_cells[n + 1].row   # end row (exclusive)
        print("n={} r0={} r1={}".format(n, top_row, stop_row))
        # Sum column E where column F reads "GR".
        for row_no in range(top_row + 1, stop_row):
            amount = sheet.cell(row_no, 5).value
            marker = sheet.cell(row_no, 6).value
            print(row_no, amount, marker)
            if marker == "GR":
                group_total += amount
        # Write the total into the yellow row itself and keep it yellow.
        sheet.cell(top_row, 5).value = group_total
        sheet.cell(top_row, 5).fill = PatternFill(start_color="FFFF00", fill_type="solid")
        print("total:{}\n".format(group_total))
    # Persist the changes.
    workbook.save(file_name)
# program
# Entry point: update file.xlsx in place.
file_name = "file.xlsx"
sum_gr_values(file_name)
When I use splitter_freq.py to split my .dat file, I encounter the following problem:
line 5 name=str(sys.argv[position+1])
indexerror list index out of range
I use ubuntu and the command is
python3.6 splitter_freq.py abc.dat
This is the splitter_freq.py
import sys
# NOTE(review): the `name = ...` line below is the reported crash -- it
# reads sys.argv[2], so the script needs at least TWO command-line
# arguments, while `python3.6 splitter_freq.py abc.dat` supplies only one
# (hence "IndexError: list index out of range").  `name` is used below as
# the output directory component of every path.
arguments = len(sys.argv) - 1   # number of arguments actually supplied
position = 1                    # index of the current input-file argument
name = str(sys.argv[position+1])
while (arguments > position): # changed >= to > to avoid error
    bench_freq_names = []   # (bench, freq) pairs already seen in this file
    path0 = "./%s/"%name+sys.argv[position]
    position = position+1
    print("the input file is ",path0)
    c_0 = open(path0,'r')
    header=c_0.readline()   # column-header line, repeated into each output
    l_0 = c_0.readline()    # first data line
    w_0 = l_0.split()
    bench_name = w_0[1]     # benchmark name is column 2
    freq_value = w_0[3]     # frequency value is column 4
    path_out = path0.split("/")
    path_out = path_out[-1].split(".")   # base name without extension
    print("the benchmark name is ",bench_name)
    print("the freq value is ",freq_value)
    bench_freq_names.append([bench_name,freq_value])
    # One output file per (benchmark, frequency) pair, opened in append mode.
    output_file = "./%s/"%name+path_out[0]+"_"+bench_name+"_"+freq_value+".txt"
    m = open(output_file,'a')
    print(header.rstrip('\n'),file=m)
    print(l_0.rstrip('\n'),file=m)
    for l_0 in c_0: #read lines one by one
        w_0 = l_0.split()
        if (w_0[1] == bench_name and w_0[3] == freq_value):
            # Same pair: keep appending to the current output file.
            print(l_0.rstrip('\n'),file=m)
        else: #new_bench_name or new freq_value
            m.close() #close file
            bench_name = w_0[1] #update bench
            freq_value = w_0[3] #update freq
            print("the benchmark name is ",bench_name)
            print("the freq value is ",freq_value)
            output_file = "./%s/"%name+path_out[0]+"_"+bench_name+"_"+freq_value+".txt"
            m = open(output_file,'a')
            if [bench_name,freq_value] not in bench_freq_names:
                bench_freq_names.append([bench_name,freq_value])
                # Write the header only the first time this pair is seen.
                print(header.rstrip('\n'),file=m)
            print(l_0.rstrip('\n'),file=m)
    c_0.close()
    m.close()
Hello, I am currently working on my project. I want to get candidate text blocks by using the algorithm below.
My input is a csv document which contain :
HTML column : the html code in a line
TAG column : the tag of html code in a line
Words : the text inside the tag in a line
TC : the number of words in a line
LTC : the number of anchor words in a line
TG : the number of tag in a line
P : the number of tag p and br in a line
CTTD : TC + (0.2*LTC) + TG - P
CTTDs : the smoothed CTTD
This is my algorithm to find candidate of text block. I make the csv file into dataframe using pandas. I am using CTTDs,TC and TG column to find the candidate.
from ListSmoothing import get_filepaths_smoothing
import pandas as pd
import numpy as np
import csv
# Gather every smoothed CSV produced by the smoothing step.
filenames = get_filepaths_smoothing(r"C:\Users\kimhyesung\PycharmProjects\newsextraction\smoothing")
index = 0   # sequence number used for the output file names
for f in filenames:
    file_html=open(str(f),"r")
    df = pd.read_csv(file_html)
    #df = pd.read_csv('smoothing/Smoothing001.csv')
    news = np.array(df['CTTDs'])   # smoothed CTTD per line
    new = np.array(df['TG'])       # tag count per line
    # CTTD threshold: min + j * (max - min), computed over non-zero CTTDs.
    minval = np.min(news[np.nonzero(news)])
    maxval = np.max(news[np.nonzero(news)])
    j = 0.2
    thetaCTTD = minval + j * (maxval-minval)
    #maxGap = np.max(new[np.nonzero(new)])
    #minGap = np.min(new[np.nonzero(new)])
    # Gap threshold: smallest non-zero tag count in the document.
    thetaGap = np.min(new[np.nonzero(new)])
    #print thetaCTTD
    #print maxval
    #print minval
    #print thetaGap
def create_candidates(df, thetaCTTD, thetaGAP):
    """Find candidate text blocks in a parsed-HTML DataFrame.

    A block starts at a row whose smoothed CTTD exceeds `thetaCTTD` and
    extends until a low-CTTD row is reached after at least `thetaGAP`
    low-CTTD rows have been tolerated (rows with TG == 0 are skipped).

    Args:
        df: DataFrame with 'CTTDs', 'TC' and 'TG' columns.
        thetaCTTD: minimum smoothed CTTD for a row to belong to a block.
        thetaGAP: number of low-CTTD rows tolerated before the block ends.

    Returns:
        dict mapping candidate number -> {'start': row, 'end': row}
        (positional row indices, end inclusive).
    """
    k = 0
    TB = {}
    TC = 0
    # NOTE: the last row is never considered (len(df) - 1), kept as posted.
    for index in range(0, len(df) - 1):
        start = index
        # Positional .iloc -- DataFrame.ix was deprecated and then removed
        # from pandas, so .ix no longer exists on current versions.
        if df.iloc[index]['CTTDs'] > thetaCTTD:
            start = index
            gap = 0
            TC = df.iloc[index]['TC']
            for index in range(index + 1, len(df) - 1):
                if df.iloc[index]['TG'] == 0:
                    # Tag-free line: neither extends nor ends the block.
                    continue
                elif df.iloc[index]['CTTDs'] <= thetaCTTD and gap >= thetaGAP:
                    break
                elif df.iloc[index]['CTTDs'] <= thetaCTTD:
                    gap += 1
                TC += df.iloc[index]['TC']
            # Reject empty blocks and blocks that never grew past `start`.
            if (TC < 1) or (start == index):
                continue
            TB.update({
                k: {
                    'start': start,
                    'end': index - 1
                }
            })
            k += 1
    return TB
def get_unique_candidate(TB):
    """Resolve duplicate/overlapping candidate blocks.

    When two neighbouring candidates share the same 'end', the second is
    dropped; when the second starts strictly inside the first, the first
    is truncated just before it.  Expects consecutive integer keys
    0..len(TB)-1, as produced by create_candidates().

    BUG FIX: the original body referenced a module-global `tb` instead of
    the parameter, so calling it with any other variable name raised
    NameError; `iteritems()` was also Python 2 only.

    Note: the copy is shallow, so truncating an entry also mutates the
    dict that was passed in (matching the original's behaviour).

    Args:
        TB: {k: {'start': row, 'end': row}} with consecutive int keys.

    Returns:
        A new dict with overlaps resolved.
    """
    result = TB.copy()
    for key, value in TB.items():
        if key == len(TB) - 1:
            break   # last entry has no successor to compare against
        nxt = TB[key + 1]
        if value['end'] == nxt['end']:
            # Same end point: keep only the earlier candidate.
            del result[key + 1]
        elif value['start'] < nxt['start'] < value['end']:
            # Overlap: truncate this candidate before the next one starts.
            result[key]['end'] = nxt['start'] - 1
    return result
index += 1
stored_file = "textcandidate/textcandidate" + '{0:03}'.format(index) + ".csv"
# Detect candidate blocks, then merge/trim overlapping ones.
tb = create_candidates(df, thetaCTTD, thetaGap)
TB = get_unique_candidate(tb)
filewrite = open(stored_file, "wb")
df_list = []
# NOTE(review): dict.iteritems() is Python 2 only -- use .items() on Py3.
for (k, d) in TB.iteritems():
    # .loc slices by label, so d['end'] is included.
    candidate_df = df.loc[d['start']:d['end']]
    candidate_df['candidate'] = k   # tag rows with their candidate number
    df_list.append(candidate_df)
output_df = pd.concat(df_list)
output_df.to_csv(stored_file)
# NOTE(review): `filewrite` is never written to, and `.close` is missing
# its parentheses, so the handle is not actually closed.
writer = csv.writer(filewrite, lineterminator='\n')
filewrite.close
ThetaCTTD is 10.36 and thethaGap is 1.
The output is
The output means there are 2 candidate text blocks. The first candidate starts at line number 215 and ends at line number 225 (like the picture below), and the other candidate starts at line number 500 and ends at line number 501.
My question is how to save the output into csv and not only the number of line but the range of the text block and the others column will appear as the output too?
My expected output is like the screenshot of candidate text block is like this one
Assuming your output is a list of dictionaries:
pd.concat([df.loc[d['start']:d['end']] for (k, d) in TB.iteritems()])
Note that we slice by label, so d['end'] will be included.
Edit: add the candidate number in a new column.
It's cleaner to write a loop than to do two concat operations:
# Collect each candidate's rows, tag them with the candidate number, then
# concatenate once at the end (concatenating all frames in one call is
# faster than repeated pd.concat).
df_list = []
for (k, d) in TB.items():  # .iteritems() was Python 2 only
    # .copy() so the new column doesn't trigger SettingWithCopyWarning;
    # .loc slices by label, so d['end'] is included.
    candidate_df = df.loc[d['start']:d['end']].copy()
    candidate_df['candidate'] = k
    df_list.append(candidate_df)
output_df = pd.concat(df_list)
It's also faster to concatenate all dataframes at once at the end.
I am trying to retrieve all data from an sqlite3 database, format it and save it to a file, read that file and try to sort the records alphabetically then save that to a new file but It doesn't save them to the new file properly and i end up with more than one copy of that record. Can somebody help?
What I am trying to do:
Retrieve records from a database
Format these records
Save to a (unsorted) file
Take out the records from the (unsorted) file
Sort the records alphabetically
Save the (sorted) records to a new file
Text to avoid code after list bug
# Pull every row from the joined Student/Behaviour tables.
c.execute("SELECT * FROM Student, Behaviour")
data = c.fetchall()
currentRecords = open('Current Records - Unsorted', 'w')
l = []
for i in data: #for individual records in the whole database do:
    # Strip tuple/unicode punctuation so each record prints as plain text.
    record = str(i)
    record = record.replace("u'","")
    record = record.replace("'", "")
    record = record.replace('"', '')
    record = record.replace("(","")
    record = record.replace(")", "")
    record = record.replace(","," -")
    currentRecords.write(record+"\r\n")
    currentRecords.write('------------------------------------------------------------------------------'+"\r\n")
currentRecords.close()
y = open('Current Records - Unsorted','r')
z = y.read() #opening the file containing the unsorted, formatted records to read
# NOTE(review): read() returns the WHOLE file as one string, so `l` ends
# up with a single element and the sort below cannot reorder records --
# this is the likely cause of the reported problem; `l` should collect
# individual records instead (e.g. readlines() or split on the separator).
l.append(z)
y.close()
def sort(l):
    """Return a new list with the items of `l` in ascending order.

    Recursive quicksort: the first element is the pivot; items are
    partitioned into smaller / equal / larger buckets and the outer
    buckets are sorted recursively.
    """
    # Lists of zero or one element are already sorted.
    if len(l) <= 1:
        return l
    pivot = l[0]
    smaller, same, larger = [], [], []
    for item in l:
        if item < pivot:
            smaller.append(item)
        elif item == pivot:
            same.append(item)
        elif item > pivot:
            larger.append(item)
    return sort(smaller) + same + sort(larger)
# Sort the records and write them to the final 'Current Records' file.
sortedCurrentRecords = sort(l)
sortedCurrentRecordsFile = open('Current Records', 'w')
for individualRecords in sortedCurrentRecords:
    sortedCurrentRecordsFile.write(individualRecords)
sortedCurrentRecordsFile.close()
# Open the result in TextEdit (macOS 'open' command).
# NOTE(review): requires `import subprocess` at the top of the file.
sortedCurrentRecordsFile1 = 'Current Records'
subprocess.call(['open','-a','TextEdit', sortedCurrentRecordsFile1])
Is there any way to split/unmerge cells in excel workbook using python? What I want is explained below -
The result should a new excel file with following entries -
My solution using xlrd to copy the same string for all merged column is as given below -
[Note: "formatted_info = True" flag is not yet implemented in xlrd which I am using hence I cannot directly get the list of merged cells.. I am not supposed to upgrade xlrd on the setup.]
def xlsx_to_dict():
    """Read `xlsfile` (module global) and build a nested phone directory.

    Column 0 holds city names (blank when merged), column 1 person names
    (blank when merged), remaining columns phone numbers.  Blank cells
    inherit the last non-blank city/name seen above them.

    Returns:
        {city: {name: {'Phone': [numbers...]}}} for the LAST worksheet
        (kept as posted -- result_dict is rebuilt per sheet; confirm
        whether per-sheet results were intended).
    """
    workbook = xlrd.open_workbook(xlsfile)
    worksheet_names = workbook.sheet_names()
    for worksheet_name in worksheet_names:
        worksheet = workbook.sheet_by_name(worksheet_name)
        num_rows = worksheet.nrows - 1
        num_cells = worksheet.ncols - 1
        # Header texts from row 0.
        header_row = worksheet.row(0)
        columns = []
        for cell in range(len(header_row)):
            value = worksheet.cell_value(0, cell)
            columns.append(value)
        # All non-blank city values.  BUG FIX: range(1, num_rows) skipped
        # the last data row (num_rows is already nrows - 1).
        cities = []
        for row in range(1, worksheet.nrows):
            value = worksheet.cell_value(row, 0)
            if not value == "":
                cities.append(value)
        # All non-blank person names.
        names = []
        for row in range(1, worksheet.nrows):
            value = worksheet.cell_value(row, 1)
            if not value == "":
                names.append(value)
        current_city = cities[0]
        result_dict = {}
        for curr_row in range(1, worksheet.nrows):
            curr_cell = -1
            curr_name = names[0]
            while curr_cell < num_cells:
                curr_cell += 1
                cell_value = worksheet.cell_value(curr_row, curr_cell)
                if cell_value in cities and curr_cell == 0:
                    current_city = cell_value
                    # dict.has_key() was Python 2 only; use `in`.
                    if current_city not in result_dict:
                        result_dict[current_city] = {}
                    continue
                if cell_value == "" and curr_cell == 0:
                    # Merged city cell: keep the current city.
                    continue
                if cell_value in names and curr_cell == 1:
                    curr_name = cell_value
                    if curr_name not in result_dict[current_city]:
                        result_dict[current_city][curr_name] = {}
                    continue
                if cell_value == "" and curr_cell == 1:
                    # Merged name cell: keep the current name.
                    continue
                try:
                    # BUG FIX: was `cell_Value` (NameError), and the bare
                    # except hid it; catch only the missing 'Phone' key.
                    result_dict[current_city][curr_name]['Phone'].append(cell_value)
                except KeyError:
                    result_dict[current_city][curr_name]['Phone'] = [cell_value]
    # BUG FIX: the original never returned, despite the documented contract.
    return result_dict
The above function will return python dictionary as below -
{ 'New York' : { 'Tom' : [92929292, 33929] }, ........}
I will then traverse the directory and write new excel.
However, I want some generic way of splitting merged cells.
This function gets the "real" cell value, i.e., the value of the merged cell if the coordinates are anywhere inside the merged cell.
def unmergedValue(rowx, colx, thesheet):
    """Return the "real" value at (rowx, colx) on an xlrd sheet.

    If the coordinates fall anywhere inside a merged range, the value of
    the range's top-left anchor cell is returned; otherwise the cell's own
    value.

    Args:
        rowx: 0-based row index.
        colx: 0-based column index.
        thesheet: xlrd sheet opened with formatting_info=True (so that
            merged_cells is populated).
    """
    for crange in thesheet.merged_cells:
        rlo, rhi, clo, chi = crange   # half-open bounds: [rlo, rhi), [clo, chi)
        # Direct comparisons; `xrange` no longer exists on Python 3, and
        # the membership tests were O(n) anyway.
        if rlo <= rowx < rhi and clo <= colx < chi:
            return thesheet.cell_value(rlo, clo)
    # Not inside any merged range: plain cell lookup.
    return thesheet.cell_value(rowx, colx)
Loosely based on http://www.lexicon.net/sjmachin/xlrd.html#xlrd.Sheet.merged_cells-attribute
Very inefficient, but should be acceptable for small-ish spreadsheets.
if your file has no empty cells in the middle, this may help, read the file, do some job, rewrite it.
def read_merged_xls(file_contents):
    # Read the first sheet of a workbook (given as raw bytes) into a list
    # of row lists, filling each empty cell with the value directly above
    # it -- a simple "unmerge" for vertically merged cells.
    #
    # NOTE(review): `if not cell` treats EVERY falsy value (0, 0.0, '') as
    # empty, so numeric zeros are also overwritten by the value above --
    # confirm that is acceptable for the data involved.
    book = xlrd.open_workbook(file_contents=file_contents)
    data = []
    sheet = book.sheet_by_index(0)
    for rx in range(sheet.nrows):
        line = []
        for ry in range(sheet.ncols):
            cell = sheet.cell_value(rx,ry)
            if not cell:
                # Inherit from the previous row ('' when on the first row).
                cell = data[-1][ry] if data else ''
            line.append(cell)
        data.append(line)
    return data
import xlrd
import xlsxwriter
import numpy as np
import pandas as pd
def rep(l, i):
    """Return the nearest non-empty value in `l` strictly before index `i`.

    Used to fill a blank cell (at position `i`) with the closest preceding
    value, emulating a spreadsheet "fill down" over unmerged cells.

    Args:
        l: list of cell values.
        i: index of the blank cell.

    Returns:
        The first l[j] != u'' scanning j = i-1, i-2, ..., 0, or None when
        every preceding value is empty.
    """
    # Scan backwards WITHOUT wrapping: the original `while j >= 0` loop
    # evaluated l[-1] when j reached 0, silently pulling a value from the
    # END of the list into the fill.
    for j in range(i - 1, -1, -1):
        if l[j] != u'':
            return l[j]
    return None
def write_df2xlsx(df, filename):
    """Write *df* to `filename` as an .xlsx file (sheet 'Sheet1', no index).

    Args:
        df: pandas DataFrame to export.
        filename: destination .xlsx path (overwritten if it exists).
    """
    # ExcelWriter.save() was deprecated and removed in pandas 2.0; the
    # context manager closes the writer and saves the workbook instead.
    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        # Convert the dataframe to an XlsxWriter Excel object.
        df.to_excel(writer, sheet_name='Sheet1', index=False)
def csv_from_excel(filename):
    """Load an .xlsx file, fill down values left blank by merged cells,
    and write the result to 'ResultFile.xlsx'.

    For each worksheet: locate the header row (the first row with no empty
    cells), read everything below it into a DataFrame, then replace each
    empty cell with the nearest non-empty value above it (via rep()).

    Args:
        filename: path of the workbook to read.
    """
    wb = xlrd.open_workbook(filename)
    worksheet_names = wb.sheet_names()
    for worksheet_name in worksheet_names:
        sh = wb.sheet_by_name(worksheet_name)
        # Find the header row: the first row with no empty cells.
        # (The original used len(filter(...)), which crashes on Python 3
        # where filter() returns a lazy iterator.)
        header_index = 0
        header_row = []
        for i in range(sh.nrows):
            row_cells = sh.row(i)
            if all(c.value != xlrd.empty_cell.value for c in row_cells):
                header_row = row_cells
                header_index = i
                break
        columns = [sh.cell_value(header_index, cell)
                   for cell in range(len(header_row))]
        # All rows below the header become the DataFrame body.
        rows = [sh.row_values(rownum)
                for rownum in range(header_index + 1, sh.nrows)]
        data = pd.DataFrame(rows, columns=columns)
        # Columns containing at least one empty cell need the fill-down.
        cols = [col for col in data.columns if u'' in list(data[col])]
        res = []
        for col in cols:
            t_list = list(data[col])
            # map() is lazy on Python 3; a list comprehension keeps the
            # original semantics (rep() looks backwards from each index).
            res.append([rep(t_list, idx) if x == u'' else x
                        for idx, x in enumerate(t_list)])
        for (col, r) in zip(cols, res):
            data[col] = pd.Series(r)   # pd.core.series.Series is internal
        write_df2xlsx(data, 'ResultFile.xlsx')