matching results in a list of lists - python

I have a list with the following structure;
[('0','927','928'),('2','693','694'),('2','742','743'),('2','776','777'),('2','804','805'),
('2','987','988'),('2','997','998'),('2','1019','1020'),
('2','1038','1039'),('2','1047','1048'),('2','1083','1084'),('2','659','660'),
('2','677','678'),('2','743','744'),('2','777','778'),('2','805','806'),('2','830','831')
the 1st number is an id, the second a position of a word and the third number is the position of a second word. What I need to do and am struggling with is finding sets of words next to each other.
These results are given for searches of 3 words, so there is the positions of word 1 with word 2 and positions of word 2 with word 3. For example ;
I run the phrase query "women in science" I then get the values given in the list above, so ('2','776','777') is the results for 'women in' and ('2','777','778') is the results for 'in science'.
I need to find a way to match these results up, so for every document it groups the words together depending on amounts of word in the query. (so if there is 4 words in the query there will be 3 results that need to be matched together).
Is this possible?

You need to quickly find word info by its position. Create a dictionary keyed by word position:
# from your example; I wonder why you use strings and not numbers.
positions = [('0','927','928'),('2','693','694'),('2','742','743'),('2','776','777'),('2','804','805'),
('2','987','988'),('2','997','998'),('2','1019','1020'),
('2','1038','1039'),('2','1047','1048'),('2','1083','1084'),('2','659','660'),
('2','677','678'),('2','743','744'),('2','777','778'),('2','805','806'),('2','830','831')]
# create the dictionary
dict_by_position = {w_pos:(w_id, w_next) for (w_id, w_pos, w_next) in positions}
Now it's a piece of cake to follow chains:
>>> dict_by_position['776']
('2', '777')
>>> dict_by_position['777']
('2', '778')
Or programmatically:
def followChain(start, position_dict):
result = []
scanner = start
while scanner in position_dict:
next_item = position_dict[scanner]
result.append(next_item)
unused_id, scanner = next_item # unpack the (id, next_position)
return result
>>> followChain('776', dict_by_position)
[('2', '777'), ('2', '778')]
Finding all chains that are not subchains of each other:
seen_items = set()
for start in dict_by_position:
if start not in seen_items:
chain = followChain(start, dict_by_position)
seen_items.update(set(chain)) # mark all pieces of chain as seen
print chain # or do something reasonable instead

The following will do what you're asking, as I understand it - it's not the prettiest output in the world, and I think that if possible you should be using numbers if numbers are what you're trying to work with.
There are probably more elegant solutions, and simplifications that could be made to this:
positions = [('0','927','928'),('2','693','694'),('2','742','743'),('2','776','777'),('2','804','805'),
('2','987','988'),('2','997','998'),('2','1019','1020'),
('2','1038','1039'),('2','1047','1048'),('2','1083','1084'),('2','659','660'),
('2','677','678'),('2','743','744'),('2','777','778'),('2','805','806'),('2','830','831')]
sorted_dict = {}
sorted_list = []
grouped_list = []
doc_ids = []
def sort_func(positions):
for item in positions:
if item[0] not in doc_ids:
doc_ids.append(item[0])
for doc_id in doc_ids:
sorted_set = []
for item in positions:
if item[0] != doc_id:
continue
else:
if item[1] not in sorted_set:
sorted_set.append(item[1])
if item[2] not in sorted_set:
sorted_set.append(item[2])
sorted_list = sorted(sorted_set)
sorted_dict[doc_id] = sorted_list
for k in sorted_dict:
group = []
grouped_list = []
for i in sorted_dict[k]:
try:
if int(i)-1 == int(sorted_dict[k][sorted_dict[k].index(i)-1]):
group.append(i)
else:
if group != []:
grouped_list.append(group)
group = [i]
except IndexError:
group.append(i)
continue
if grouped_list != []:
sorted_dict[k] = grouped_list
else:
sorted_dict[k] = group
return sorted_dict
My output for the above was:
{'0': ['927', '928'], '2': [['1019', '1020'], ['1038', '1039'], ['1047', '1048'], ['1083', '1084'], ['659', '660'], ['677', '678'], ['693', '694'], ['742', '743', '744'], ['776', '777', '778'], ['804', '805', '806'], ['830', '831'], ['987', '988']]}

Related

How to create a nest list or a tree view from a flat list based on value condition?

I am working on a problem that given a flat list of strings. But based on the name of the string I would like to create either a nested list, a dictionary or a Node class. Anything that can have a tree structure
The string looks something like:
['big_1', 'small_1', 'item_1', 'item_2', 'big_2', 'small_2', 'item_3']
This should be turned into something like
['big_1', ['small_1', ['item_1', 'item_2']], 'big_2', ['small_2', ['item_3']]]
or a nested dictionary:
{'big_1': { 'small_1': ['item_1', 'item_2']}, 'big_2': {'small_2': ['item_3']}}
The example has 3 levels but it can be of any amount of levels. I tried something like this because that is not correct:
x = ['a_1', 'b_1', 'b_2', 'a_2', 'b_3', 'a_3', 'b_4', 'b_5', 'b_6']
def category(input):
return input.split('_')[0]
cats = list(dict.fromkeys([category(i) for i in x])) # order preserved unique set
results = {}
prev_cat = ''
result = []
index = 0
for item in x:
if index == 0:
result.append(item)
index += 1
prev_cat = category(item)
else:
curr_cat = category(item)
if curr_cat == prev_cat:
# Same category, append
result.append(item)
else:
result.append([item])
print(result)
It returns ['a_1', ['b_1'], ['b_2'], 'a_2', ['b_3'], 'a_3', ['b_4'], ['b_5'], ['b_6']]
Any suggestion please?
I think I found a way to achieve this using recursion.
x = ["big_1", "small_1", "item_1", "item_2", "big_2", "small_2", "item_3"]
def category(input):
return input.split("_")[0]
def parse(l):
out = []
cat = category(l[0])
sublist = []
for el in l:
if category(el) != cat:
sublist.append(el)
else:
if len(sublist) > 0:
out.append(parse(sublist))
sublist = []
out.append(el)
if len(sublist) > 0:
out.append(parse(sublist))
sublist = []
return out
print(parse(x))
Basically what I do is recursively call the parse function each time I find a different level. The code is just a test and can definitely be improved.

How to find the highest value element in a list with reference to a dictionary on python

How do I code a function in python which can:
iterate through a list of word strings which may contain duplicate words and referencing to a dictionary,
find the word with the highest absolute sum, and
output it along with the corresponding absolute value.
The function also has to ignore words which are not in the dictionary.
For example,
Assume the function is called H_abs_W().
Given the following list and dict:
list_1 = ['apples','oranges','pears','apples']
Dict_1 = {'apples':5.23,'pears':-7.62}
Then calling the function as:
H_abs_W(list_1,Dict_1)
Should give the output:
'apples',10.46
EDIT:
I managed to do it in the end with the code below. Looking over the answers, turns out I could have done it in a shorter fashion, lol.
def H_abs_W(list_1,Dict_1):
freqW = {}
for char in list_1:
if char in freqW:
freqW[char] += 1
else:
freqW[char] = 1
ASum_W = 0
i_word = ''
for a,b in freqW.items():
x = 0
d = Dict_1.get(a,0)
x = abs(float(b)*float(d))
if x > ASum_W:
ASum_W = x
i_word = a
return(i_word,ASum_W)
list_1 = ['apples','oranges','pears','apples']
Dict_1 = {'apples':5.23,'pears':-7.62}
d = {k:0 for k in list_1}
for x in list_1:
if x in Dict_1.keys():
d[x]+=Dict_1[x]
m = max(Dict_1, key=Dict_1.get)
print(m,Dict_1[m])
try this,
key, value = sorted(Dict_1.items(), key = lambda x : x[1], reverse=True)[0]
print(f"{key}, {list_1.count(key) * value}")
# apples, 10.46
you can use Counter to calculate the frequency(number of occurrences) of each item in the list.
max(counter.values()) will give us the count of maximum occurring element
max(counter, key=counter.get) will give the which item in the list is
associated with that highest count.
========================================================================
from collections import Counter
def H_abs_W(list_1, Dict_1):
counter = Counter(list_1)
count = max(counter.values())
item = max(counter, key=counter.get)
return item, abs(count * Dict_1.get(item))

Word Ladder without replacement in python

I have question, where I need to implement ladder problem with different logic.
In each step, the player must either add one letter to the word
from the previous step, or take away one letter, and then rearrange the letters to make a new word.
croissant(-C) -> arsonist(-S) -> aroints(+E)->notaries(+B)->baritones(-S)->baritone
The new word should make sense from a wordList.txt which is dictionary of word.
Dictionary
My code look like this,
where I have calculated first the number of character removed "remove_list" and added "add_list". Then I have stored that value in the list.
Then I read the file, and stored into the dictionary which the sorted pair.
Then I started removing and add into the start word and matched with dictionary.
But now challenge is, some word after deletion and addition doesn't match with the dictionary and it misses the goal.
In that case, it should backtrack to previous step and should add instead of subtracting.
I am looking for some sort of recursive function, which could help in this or complete new logic which I could help to achieve the output.
Sample of my code.
start = 'croissant'
goal = 'baritone'
list_start = map(list,start)
list_goal = map(list, goal)
remove_list = [x for x in list_start if x not in list_goal]
add_list = [x for x in list_goal if x not in list_start]
file = open('wordList.txt','r')
dict_words = {}
for word in file:
strip_word = word.rstrip()
dict_words[''.join(sorted(strip_word))]=strip_word
file.close()
final_list = []
flag_remove = 0
for i in remove_list:
sorted_removed_list = sorted(start.replace(''.join(map(str, i)),"",1))
sorted_removed_string = ''.join(map(str, sorted_removed_list))
if sorted_removed_string in dict_words.keys():
print dict_words[sorted_removed_string]
final_list.append(sorted_removed_string)
flag_remove = 1
start = sorted_removed_string
print final_list
flag_add = 0
for i in add_list:
first_character = ''.join(map(str,i))
sorted_joined_list = sorted(''.join([first_character, final_list[-1]]))
sorted_joined_string = ''.join(map(str, sorted_joined_list))
if sorted_joined_string in dict_words.keys():
print dict_words[sorted_joined_string]
final_list.append(sorted_joined_string)
flag_add = 1
sorted_removed_string = sorted_joined_string
Recursion-based backtracking isn't a good idea for search problem of this sort. It blindly goes downward in search tree, without exploiting the fact that words are almost never 10-12 distance away from each other, causing StackOverflow (or recursion limit exceeded in Python).
The solution here uses breadth-first search. It uses mate(s) as helper, which given a word s, finds all possible words we can travel to next. mate in turn uses a global dictionary wdict, pre-processed at the beginning of the program, which for a given word, finds all it's anagrams (i.e re-arrangement of letters).
from queue import Queue
words = set(''.join(s[:-1]) for s in open("wordsEn.txt"))
wdict = {}
for w in words:
s = ''.join(sorted(w))
if s in wdict: wdict[s].append(w)
else: wdict[s] = [w]
def mate(s):
global wdict
ans = [''.join(s[:c]+s[c+1:]) for c in range(len(s))]
for c in range(97,123): ans.append(s + chr(c))
for m in ans: yield from wdict.get(''.join(sorted(m)),[])
def bfs(start,goal,depth=0):
already = set([start])
prev = {}
q = Queue()
q.put(start)
while not q.empty():
cur = q.get()
if cur==goal:
ans = []
while cur: ans.append(cur);cur = prev.get(cur)
return ans[::-1] #reverse the array
for m in mate(cur):
if m not in already:
already.add(m)
q.put(m)
prev[m] = cur
print(bfs('croissant','baritone'))
which outputs: ['croissant', 'arsonist', 'rations', 'senorita', 'baritones', 'baritone']

Split list based on first character - Python

I am new to Python and can't quite figure out a solution to my Problem. I would like to split a list into two lists, based on what the list item starts with. My list looks like this, each line represents an item (yes this is not the correct list notation, but for a better overview i'll leave it like this) :
***
**
.param
+foo = bar
+foofoo = barbar
+foofoofoo = barbarbar
.model
+spam = eggs
+spamspam = eggseggs
+spamspamspam = eggseggseggs
So I want a list that contains all lines starting with a '+' between .param and .model and another list that contains all lines starting with a '+' after model until the end.
I have looked at enumerate() and split(), but since I have a list and not a string and am not trying to match whole items in the list, I'm not sure how to implement them.
What I have is this:
paramList = []
for line in newContent:
while line.startswith('+'):
paramList.append(line)
if line.startswith('.'):
break
This is just my try to create the first list. The Problem is, the code reads the second block of '+'s as well because break just Exits the while Loop, not the for Loop.
I hope you can understand my question and thanks in advance for any pointers!
What you want is really a simple task that can be accomplish using list slices and list comprehension:
data = ['**','***','.param','+foo = bar','+foofoo = barbar','+foofoofoo = barbarbar',
'.model','+spam = eggs','+spamspam = eggseggs','+spamspamspam = eggseggseggs']
# First get the interesting positions.
param_tag_pos = data.index('.param')
model_tag_pos = data.index('.model')
# Get all elements between tags.
params = [param for param in data[param_tag_pos + 1: model_tag_pos] if param.startswith('+')]
models = [model for model in data[model_tag_pos + 1: -1] if model.startswith('+')]
print(params)
print(models)
Output
>>> ['+foo = bar', '+foofoo = barbar', '+foofoofoo = barbarbar']
>>> ['+spam = eggs', '+spamspam = eggseggs']
Answer to comment:
Suppose you have a list containing numbers from 0 up to 5.
l = [0, 1, 2, 3, 4, 5]
Then using list slices you can select a subset of l:
another = l[2:5] # another is [2, 3, 4]
That what we are doing here:
data[param_tag_pos + 1: model_tag_pos]
And for your last question: ...how does python know param are the lines in data it should iterate over and what exactly does the first paramin param for paramdo?
Python doesn't know, You have to tell him.
First param is a variable name I'm using here, it cuold be x, list_items, whatever you want.
and I will translate the line of code to plain english for you:
# Pythonian
params = [param for param in data[param_tag_pos + 1: model_tag_pos] if param.startswith('+')]
# English
params is a list of "things", for each "thing" we can see in the list `data`
from position `param_tag_pos + 1` to position `model_tag_pos`, just if that "thing" starts with the character '+'.
data = {}
for line in newContent:
if line.startswith('.'):
cur_dict = {}
data[line[1:]] = cur_dict
elif line.startswith('+'):
key, value = line[1:].split(' = ', 1)
cur_dict[key] = value
This creates a dict of dicts:
{'model': {'spam': 'eggs',
'spamspam': 'eggseggs',
'spamspamspam': 'eggseggseggs'},
'param': {'foo': 'bar',
'foofoo': 'barbar',
'foofoofoo': 'barbarbar'}}
I am new to Python
Whoops. Don't bother with my answer then.
I want a list that contains all lines starting with a '+' between
.param and .model and another list that contains all lines starting
with a '+' after model until the end.
import itertools as it
import pprint
data = [
'***',
'**',
'.param',
'+foo = bar',
'+foofoo = barbar',
'+foofoofoo = barbarbar',
'.model',
'+spam = eggs',
'+spamspam = eggseggs',
'+spamspamspam = eggseggseggs',
]
results = [
list(group) for key, group in it.groupby(data, lambda s: s.startswith('+'))
if key
]
pprint.pprint(results)
print '-' * 20
print results[0]
print '-' * 20
pprint.pprint(results[1])
--output:--
[['+foo = bar', '+foofoo = barbar', '+foofoofoo = barbarbar'],
['+spam = eggs', '+spamspam = eggseggs', '+spamspamspam = eggseggseggs']]
--------------------
['+foo = bar', '+foofoo = barbar', '+foofoofoo = barbarbar']
--------------------
['+spam = eggs', '+spamspam = eggseggs', '+spamspamspam = eggseggseggs']
This thing here:
it.groupby(data, lambda x: x.startswith('+')
...tells python to create groups from the strings according to their first character. If the first character is a '+', then the string gets put into a True group. If the first character is not a '+', then the string gets put into a False group. However, there are more than two groups because consecutive False strings will form a group, and consecutive True strings will form a group.
Based on your data, the first three strings:
***
**
.param
will create one False group. Then, the next strings:
+foo = bar
+foofoo = barbar
+foofoofoo = barbarbar
will create one True group. Then the next string:
'.model'
will create another False group. Then the next strings:
'+spam = eggs'
'+spamspam = eggseggs'
'+spamspamspam = eggseggseggs'
will create another True group. The result will be something like:
{
False: [strs here],
True: [strs here],
False: [strs here],
True: [strs here]
}
Then it's just a matter of picking out each True group: if key, and then converting the corresponding group to a list: list(group).
Response to comment:
where exactly does python go through data, like how does it know s is
the data it's iterating over?
groupby() works like do_stuff() below:
def do_stuff(items, func):
for item in items:
print func(item)
#Create the arguments for do_stuff():
data = [1, 2, 3]
def my_func(x):
return x + 100
#Call do_stuff() with the proper argument types:
do_stuff(data, my_func) #Just like when calling groupby(), you provide some data
#and a function that you want applied to each item in data
--output:--
101
102
103
Which can also be written like this:
do_stuff(data, lambda x: x + 100)
lambda creates an anonymous function, which is convenient for simple functions which you don't need to refer to by name.
This list comprehension:
[
list(group)
for key, group in it.groupby(data, lambda s: s.startswith('+'))
if key
]
is equivalent to this:
results = []
for key, group in it.groupby(data, lambda s: s.startswith('+') ):
if key:
results.append(list(group))
It's clearer to explicitly write a for loop, however list comprehensions execute much faster. Here is some detail:
[
list(group) #The item you want to be in the results list for the current iteration of the loop here:
for key, group in it.groupby(data, lambda s: s.startswith('+')) #A for loop
if key #Only include the item for the current loop iteration in the results list if key is True
]
I would suggest doing things step by step.
1) Grab every word from the array separately.
2) Grab the first letter of the word.
3) Look if that is a '+' or '.'
Example code:
import re
class Dark():
def __init__(self):
# Array
x = ['+Hello', '.World', '+Hobbits', '+Dwarves', '.Orcs']
xPlus = []
xDot = []
# Values
i = 0
# Look through every word in the array one by one.
while (i != len(x)):
# Grab every word (s), and convert to string (y).
s = x[i:i+1]
y = '\n'.join(s)
# Print word
print(y)
# Grab the first letter.
letter = y[:1]
if (letter == '+'):
xPlus.append(y)
elif (letter == '.'):
xDot.append(y)
else:
pass
# Add +1
i = i + 1
# Print lists
print(xPlus)
print(xDot)
#Run class
Dark()

Finding number of repeats inside a list in a file with Python

I need to find the number of times an entry in the list repeats consecutively. For example, consider the following file
"hello hello [A B C]"
"my world [D C F L]"
"tick tock [A L]"
In this file, the number of times C repeats is 2
A repeat is not counted as it is not repeating consecutively.
I am not sure of using re as it wouldnt tell me if it repeats consecutively. Any help would be apprecited.
the most simple way is to use re to parse the file.
regular expression that could work : \[([A-Z]\s)+[A-Z]\]
then with the list of "list string" (aka ["[A B C]","[ F G R]"] ) convert it to a list.
the format must be like this for "[A B C]" "ABC", so remove spaces and [] for each one.
converted_string_list = list(str_list)
so a print converted_string_list will result in a list like this one for a string like "ADF":
['A', 'D', 'F']
then merge all list and find duplicates.
this is straigh forward solution! I am sure a better solution exists
For counting the duplicates once you get them into a list:
initial_length = len(my_list)
new_length = len(set(my_list))
duplicates = initial_length - new_length
def find_repeats_in_list(lines):
# get lists from every line
all_items = []
for line in lines:
open_bracket = line.index('[')
close_bracket = line.index(']')
items = line[open_bracket+1:close_bracket].split()
all_items.append(items)
# initialize dictionaries to hold consecutive counts
counts = dict()
final = dict()
# seed counts with list from first line
for item in all_items[0]:
counts[item] = 1
# check for first line list items in subsequent lines
for items in all_items[1:]:
for counted in counts:
remove = []
if counted not in items: # not in current line, stop counting
remove.append(counted)
if counts[counted] > 1: # but put in final if more than one
final[counted] = counts[counted]
for item in remove:
del counts[item]
for item in items: # now increment anything consecutive
if item in counts:
counts[item] += 1
return final

Categories

Resources