Retrieve from dict based on list values - python

Here is my data -
inp = [{'father_husband_mother_name': [['Father s Name', 0.8603670001029968],
['Shripati', 0.8603670001029968],
['Father s Name', 0.8903670001029969],
['Shpppati', 0.8903670001029969]],
'doc_id': [['GGX2176', 0.8435981869697571],
['GGC2176', 0.8835981869697571]],
'name': [['Elector s Name', 0.8301510810852051],
['Shibshankar Ghosh', 0.8301510810852051],
['Elector s Name', 0.8501510810852051],
['Shibshankar Ghosh', 0.8501510810852051]],
'date_of_birth': [['Age as on 1.1.2000', 0.8067844915390014],
['15', 0.8067844915390014],
['Age as on 1.1.2000', 0.8267844915390015],
['15', 0.8267844915390015]],
'gender_sex': [['Sex', 0.7784658074378967],
['M', 0.7784658074378967],
['Sex', 0.8784658074378967],
['M', 0.8784658074378967]]}]
STOPWORDS = ['Sex', 'Father s Name', 'Elector s Name', 'Address', 'Name', 'Gender', 'Mother s Name',
'Husband s Name']
The output that I expect:
{'father_husband_mother_name': 'Shpppati',
'doc_id': 'GGC2176',
'name': 'Shibshankar Ghosh',
'date_of_birth': 'Age as on 1.1.2000,15',
'gender_sex': 'M'}
Here is the logic -
Retrieve the value that has the highest confidence score [the float inside the list of lists] that is not present in STOPWORDS for each key.
What I have tried -
def process_kie_dict(voter_raw_labels, threshold=0.7):
    """Pick, for every entity, the best-confidence texts that are not stopwords.

    Args:
        voter_raw_labels: iterable of dicts mapping an entity name to a list
            of ``[text, confidence]`` pairs.
        threshold: an entity is kept only when its highest confidence is
            strictly greater than this value.

    Returns:
        dict mapping each kept entity to a comma-joined string of the texts
        found from the first highest-confidence pair onwards. Texts that are
        empty, or within edit distance 1 of an entry in ``STOPWORDS``, are
        dropped.
    """
    best = {}  # entity -> (texts, confidence) of the best candidate seen so far
    for entity_dict in voter_raw_labels:
        for entity, pairs in entity_dict.items():
            if not pairs:
                # Nothing to choose from; max() would raise on an empty list.
                continue
            confidences = [pair[1] for pair in pairs]
            max_conf = max(confidences)
            if max_conf <= threshold:
                continue
            # Keep the earlier candidate unless this one is strictly better.
            if entity in best and best[entity][1] >= max_conf:
                continue
            first_best = confidences.index(max_conf)
            texts = [pair[0] for pair in pairs[first_best:]]
            best[entity] = (texts, max_conf)

    cleaned_dict = {}
    for entity, (texts, _conf) in best.items():
        kept = []
        for text in texts:
            # Compare the stripped text so its length and content agree.
            stripped = text.strip()
            near_stopword = any(
                editDistDP(stripped, word, len(stripped), len(word)) < 2
                for word in STOPWORDS
            )
            if stripped and not near_stopword:
                kept.append(text)
        # Joining only the kept texts avoids stray commas left by removed ones.
        cleaned_dict[entity] = ','.join(kept)
    return cleaned_dict
def editDistDP(str1, str2, m=None, n=None):
    """Return the edit distance between ``str1[:m]`` and ``str2[:n]``.

    The distance counts single-character insertions, removals and
    replacements, each at cost 1.

    Args:
        str1: first string.
        str2: second string.
        m: number of leading characters of ``str1`` to compare; defaults to
            ``len(str1)``.
        n: number of leading characters of ``str2`` to compare; defaults to
            ``len(str2)``.

    Returns:
        The minimum number of edits as an int.
    """
    if m is None:
        m = len(str1)
    if n is None:
        n = len(str2)

    # Row for an empty prefix of str1: turning "" into str2[:j] takes j inserts.
    previous = list(range(n + 1))
    for i in range(1, m + 1):
        # Column 0: turning str1[:i] into "" takes i removals.
        current = [i] + [0] * n
        char1 = str1[i - 1]
        for j in range(1, n + 1):
            if char1 == str2[j - 1]:
                # Last characters match: no extra cost beyond the shorter prefixes.
                current[j] = previous[j - 1]
            else:
                current[j] = 1 + min(
                    current[j - 1],   # insert
                    previous[j],      # remove
                    previous[j - 1],  # replace
                )
        # Only the previous row is ever read, so the full table is not kept.
        previous = current
    return previous[n]
You can forget about the edit distance implementation, not important. What I want to know is given nested for loops, this won't work at scale. Looking for a more efficient implementation.

Here is a parser for your data
# Set membership keeps the stopword test O(1) per pair.
_stop = frozenset(STOPWORDS)
# For each key take the text of the highest-confidence pair; pairs whose text
# is a stopword count as confidence 0. Scanning the list reversed makes max()
# return the last of several equally-scored pairs.
result = {
    k: max(reversed(v), key=lambda x: 0 if x[0] in _stop else x[1])[0]
    for k, v in inp[0].items()
}
In short, for each key it picks the [text, confidence] pair with the highest confidence, treating any pair whose text is in STOPWORDS as having confidence 0, and stores that pair's text as the key's value in the result dictionary.

Related

Split list according to the rule

I have a list
values_list = [1013.0, 683.0, 336.0, 406.0, 636.0, 1065.0, 1160.0]
Also I have a value
value = 660.6153846153846
This list is based on the assumption that there are 3 stages: the values in the first stage are higher than the value, those in the second are lower, and those in the third are higher again.
I want to split this list into three lists, saving the order of the values like this:
values_list = [[1013.0, 683.0], [336.0, 406.0, 636.0], [1065.0, 1160.0]]
Try this one, using groupby:
from itertools import groupby
values_list = [1013.0, 683.0, 336.0, 406.0, 636.0, 1065.0, 1160.0]
value = 660.6153846153846


def _is_below(x):
    """Tell whether x lies below the threshold `value`."""
    return x < value


# groupby starts a new run each time the key flips, so consecutive values on
# the same side of the threshold end up in the same sub-list.
result = [list(run) for _, run in groupby(values_list, key=_is_below)]
print(result)
Result:
[[1013.0, 683.0], [336.0, 406.0, 636.0], [1065.0, 1160.0]]
Try this one:
# Split values_list into consecutive runs lying on the same side of `value`.
splits = []
splt = []            # run currently being collected
prev_above = None    # whether the previous element was above `value`
for v in values_list:
    above = v > value
    # A change of side closes the current run and starts a new one.
    if splt and above != prev_above:
        splits.append(splt)
        splt = []
    splt.append(v)
    prev_above = above
# Flush the last run, which no change of side has closed.
if splt:
    splits.append(splt)

looping backward or forward in python list to find a match

I have a python list in which I will search and find one term. Once I find it, I need to go backward in the list and find the first occurrence with = and go forward and find the first occurrence with ;.
I tried using while loop but it is not working.
extract = [1,2,"3=","fd","dfdf","keyword","ssd","sdsd",";","dds"]
indices = [i for i,s in enumerate(extract) if 'keyword' in s]
for ind in indices:
ind_while_for = ind
ind_while_back = ind
if ('=' in extract[ind]) & (';' in extract[ind]):
print(extract[ind])
if (';' in extract[ind]) & ('=' not in extract[ind]):
while '=' in extract[ind_while_back-1]:
ind_while_back -= 1
print(' '.join(extract[ind_while_back:ind]))
result required : 3= fd dfdf keyword ssd sdsd ;
Find the position of the keyword:
kw = extract.index("keyword")
Find the element with the largest index that contains an "=" in the sublist of the original list before the position of the keyword:
eq = max(i for i,w in enumerate(extract[:kw])
if isinstance(w,str) and "=" in w)
Find the element with the smallest index that contains a ";" in the sublist from the previous element to the end:
semi = min(i for i,w in enumerate(extract[eq:], eq)
if isinstance(w,str) and ';' in w)
Extract the sublist between the two extremes:
extract[eq:semi+1]
#['3=', 'fd', 'dfdf', 'keyword', 'ssd', 'sdsd', ';']
You can use:
l = [1, 2, "3=", "fd", "dfdf", "keyword", "ssd", "sdsd", ";", "dds"]
s = "keyword"


def take(last, iterable):
    """Collect items up to and including the first string that contains `last`.

    Non-string items are collected but never treated as a match. If no item
    matches, every item of `iterable` is returned.
    """
    taken = []
    for x in iterable:
        taken.append(x)
        if isinstance(x, str) and last in x:
            break
    return taken


kw_pos = l.index(s)
# get all elements on the right of s
right = take(';', l[kw_pos + 1:])
# get all elements on the left of s (s included) using a reversed sublist
left = take('=', l[kw_pos::-1])
# reverse the left list back and join it to the right list
subl = left[::-1] + right
print(subl)
['3=', 'fd', 'dfdf', 'keyword', 'ssd', 'sdsd', ';']
Try below function :
extract = ['1','2','3=','fd','dfdf','keyword','ssd','sdsd',';','dds']


def get_output_list(extract, key):
    """Return the slice around `key` bounded by the nearest '=' and ';' items.

    The slice starts at the closest item before `key` that contains '=' and
    ends at the closest item after `key` that contains ';' (both included).
    Non-string items are never treated as delimiters.

    Returns:
        The sub-list, or 'no match found1' when `key`, any '=' item or any ';'
        item is missing altogether, or 'no match found' when no '=' precedes
        `key` or no ';' follows it.
    """
    equals_indices = [i for i, j in enumerate(extract)
                      if isinstance(j, str) and '=' in j]
    semicolon_indices = [i for i, j in enumerate(extract)
                         if isinstance(j, str) and ';' in j]
    if key not in extract or not equals_indices or not semicolon_indices:
        return 'no match found1'

    keyword_index = extract.index(key)
    before = [i for i in equals_indices if i < keyword_index]
    after = [i for i in semicolon_indices if i > keyword_index]
    if not before or not after:
        return 'no match found'

    # Index lists are ascending: the last '=' before the keyword and the first
    # ';' after it are the nearest delimiters on each side.
    return extract[before[-1]:after[0] + 1]

Combine elements that has same pattern in a list to strings

I have a list of strings that could vary in pattern.
lst = ['ban-eur.kd', 'ban-eur.sd', 'ban-eur.td' ]
When converted this should become ban-eur<kd,sd,td>.
It should combine elements that are next only if they can be combined. (only if it matches the pattern ban-eur)
lst = ['ban-eur.kd', 'kd', 'ban-eur.sd', 'ban-eur.td' ]
This should result in 'ban-eur.kd_kd_ban-eur<sd,td>'.
If it doesn't have any element that could be combined then it should all be just joined with a _
How can I do this, without missing the first element in the array/duplicate with in the string.
Thanks for your time.
You can use itertools.groupby:
import itertools, re
lst = ['ban-eur.kd', 'ban-eur.sd', 'ban-eur.td' ]


def group_result(d: list) -> str:
    """Combine adjacent dotted items that share the prefix before the first '.'.

    A run of two or more neighbours with the same prefix becomes
    ``prefix<s1,s2,...>`` (each ``s`` is the text after the item's last '.');
    an item with no such neighbour is kept unchanged. The pieces are joined
    with '_'. Items are not reordered, so only neighbours are ever combined.
    """
    pieces = []
    for prefix, members in itertools.groupby(d, key=lambda x: x.split('.')[0]):
        members = list(members)
        if len(members) == 1:
            pieces.append(members[0])
        else:
            suffixes = ','.join(m.split('.')[-1] for m in members)
            pieces.append('{}<{}>'.format(prefix, suffixes))
    return '_'.join(pieces)


# Runs of dotted items go through group_result; runs of plain items are only
# joined with '_'.
new_data = '_'.join(
    group_result(list(b)) if a else '_'.join(b)
    for a, b in itertools.groupby(lst, key=lambda x: '.' in x)
)
Output:
'ban-eur<kd,sd,td>'
When running on ['ban-eur.kd', 'kd', 'ban-eur.sd', 'ban-eur.td']:
'ban-eur.kd_kd_ban-eur<sd,td>'
def _render_matches(matches):
    """Render a run of 'ban-eur.*' items: one item as-is, several combined."""
    if len(matches) == 1:
        return matches[0]
    return 'ban-eur<' + ','.join(m.split('.', 1)[1] for m in matches) + '>'


parts = []
matches = []  # consecutive items starting with 'ban-eur.' seen so far
for item in lst:
    if item.startswith('ban-eur.'):
        matches.append(item)
        continue
    # A non-matching item ends the current run, if there is one.
    if matches:
        parts.append(_render_matches(matches))
        matches = []
    parts.append(item)
# Flush a run that reaches the end of the list.
if matches:
    parts.append(_render_matches(matches))
resulting_string = '_'.join(parts)
This should work
def _combine(run):
    """Render a run of same-prefix items: one item as-is, several as prefix<a,b>."""
    if len(run) == 1:
        return run[0]
    prefix = run[0].partition('.')[0]
    return prefix + '<' + ','.join(r.partition('.')[2] for r in run) + '>'


result = []
run = []  # consecutive dotted items sharing the same prefix
for item in lst:
    # An item joins the current run only if both it and the run are dotted
    # and the text before the first '.' is the same.
    joins_run = (
        bool(run)
        and '.' in item
        and '.' in run[0]
        and item.partition('.')[0] == run[0].partition('.')[0]
    )
    if run and not joins_run:
        result.append(_combine(run))
        run = []
    run.append(item)
# Flush the final run, which no later item has closed.
if run:
    result.append(_combine(run))
print('_'.join(result))

find an empty value gap in a list and allocate a group of strings

I am new in Python and I am looking for an elegant way to do this:
list0 = ["text1","text2","","","text3","","text4","text5","","text6"]
I want to group together the non-empty strings that are detected after the gap in indices 2 and 3, and allocate this group starting at a specific index (e.g. index 5). The new list should look like list1. (To see list1) click here:
well, I am not sure if this answers your post
from collections import deque

list0 = ["text","text","","","text","","text","text","","text"]
# Start indices of the blocks that follow a gap of two or more empty strings.
queue = deque()
count = 0  # length of the current run of empty strings
for i, val in enumerate(list0):
    if val == "":
        count += 1
    else:
        if count > 1:
            queue.append(i)
        count = 0
index = 5
# Sentinel: the end of the list closes the last block.
queue.append(len(list0))
while len(queue) > 1:
    begin = queue.popleft()
    end = queue[0]  # peek at the start of the next block
    tmp = [a for a in list0[begin:end] if a != ""]
    # Blank the block, then write its non-empty values starting at `index`.
    list0[begin:end] = [""] * (end - begin)
    # NOTE(review): `index` is not advanced, so with several blocks each one
    # is written at the same position -- confirm that this is intended.
    list0[index:index+len(tmp)] = tmp
I just scan through the list and sort them by block between gaps
['text', 'text', '', '', '', 'text', 'text', 'text', 'text', '']
Well, I am wondering if I am right.
From the description of the question, this seems to be what you want, but I'm unsure why there is an empty string at the end of your sample output.
Also, this directly modifies list0 , so feel free to change the references to list1 if you wish.
list0 = ["text","text","","","text","","text","text","","text"]
# Find the "gap" - the first run of gap_size consecutive empty strings.
# gap_pos is the index of the gap's last empty slot; it remains 0 if no gap
# is found.
gap_pos = 0
gap_size = 2
# "+ 1" so that the window ending on the last element is checked too.
for i in range(len(list0) - gap_size + 1):
    if all(x == '' for x in list0[i:i + gap_size]):
        gap_pos = i + gap_size - 1
        break  # remove this if you want the last gap
# Find the non-empty strings that are detected after the gap
after_gap = [x for x in list0[gap_pos + 1:] if x != '']
# allocate this group starting at a specific index (e.g. index 5)
specific_index = 5
for offset, item in enumerate(after_gap):
    allocate_at = specific_index + offset
    # Make sure not to go out-of-bounds
    if allocate_at >= len(list0):
        break
    list0[allocate_at] = item
Outputs
['text', 'text', '', '', 'text', 'text', 'text', 'text', 'text', 'text']
If it's missing anything tell me
list0 = ["text","text","","","text","","text","text","","text"]
# list1 starts as a copy, so slots before the gap keep their original value.
list1 = list0[:]
counter = 0  # empty strings seen so far, counted up to 2
for i, item in enumerate(list0):
    if item == "" and counter < 2:
        counter = counter + 1
    elif counter >= 2:
        # NOTE(review): every slot after the second empty string is filled
        # with the literal "text", not with values taken from list0 --
        # confirm this is what is wanted.
        list1[i] = "text"

Splitting tuple item in a list

How do I split the 2nd tuple
data = [('152', 'Farko', 'Kier'), ('153', 'Park - Pub')]
to get this output:
[('152', 'Farko', 'Kier'), ('153', 'Park', 'Pub')]
I tried this way:
lst = []
for i in data:
if len(i) == 2:
i[1] = tuple(i[1].split(' - '))
lst.append(i)
It would work, except that it raises TypeError: 'tuple' object does not support item assignment. I can't simply assign i = tuple(i[1].split(' - ')) either, because I need to keep the number at position i[0] of the tuple. A list comprehension solution would be very welcome. Suggestions?
You could use a list comprehension:
lst = [t[:1] + tuple(t[1].split(' - ')) if len(t) == 2 else t for t in data]
or you could adjust your loop to create a new tuple:
for i in data:
if len(i) == 2:
i = i[:1] + tuple(i[1].split(' - '))
lst.append(i)
newdata = [tuple(s for t in tu for s in t.split(' - ')) for tu in data]
# [('152', 'Farko', 'Kier'), ('153', 'Park', 'Pub')]
As you have seen you can't modify the tuple, however you can just create a new one with the data you want:
data = [('152', 'Farko', 'Kier'), ('153', 'Park - Pub')]
lst = []
for i in data:
if len(i) == 2:
i = i[:1] + tuple(i[1].split(' - '))
lst.append(i)
Or if you want to modify the original list:
for i, tup in enumerate(data):
if len(tup) == 2:
data[i] = tup[:1] + tuple(tup[1].split(' - '))

Categories

Resources