How to merge all intersecting tuples in a list? [duplicate] - python

Consider the following list:
tuple_list = [('c', 'e'), ('c', 'd'), ('a', 'b'), ('d', 'e')]
How can I achieve this?
new_tuple_list = [('c', 'e', 'd'), ('a', 'b')]
I have tried:
# Asker's attempt (kept as posted). It appends to `new_tuple_list`, which this
# snippet never creates, and its loop variable `tuple` shadows the builtin.
for tuple in tuple_list:
    for tup in tuple_list:
        # Only the FIRST elements are compared, so tuples that overlap in any
        # other position (e.g. ('c', 'd') and ('d', 'e')) are never joined.
        # Every tuple is also compared with itself.
        if tuple[0] == tup[0]:
            new_tup = (tuple[0],tuple[1],tup[1])
            new_tuple_list.append(new_tup)
But it only works if I have the elements of the tuple in a certain order which means it will result in this instead:
new_tuple_list = [('c', 'e', 'd'), ('a', 'b'), ('d', 'e')]

You could consider the tuples as edges in a graph and your goal as finding connected components within the graph. Then you could simply loop over vertices (items in tuples) and for each vertex you haven't visited yet execute DFS to generate a component:
from collections import defaultdict
def dfs(adj_list, visited, vertex, result, key):
    """Collect every vertex reachable from ``vertex`` into ``result[key]``.

    Vertices are appended in depth-first pre-order. ``visited`` and
    ``result[key]`` are both updated in place; nothing is returned.

    The traversal uses an explicit stack instead of recursion, so a very
    large component cannot hit Python's recursion limit.
    """
    visited.add(vertex)
    result[key].append(vertex)
    # Each stack entry is the partially consumed neighbour iterator of a vertex
    # on the current path; resuming it gives the same order as recursion would.
    stack = [iter(adj_list[vertex])]
    while stack:
        for neighbor in stack[-1]:
            if neighbor not in visited:
                visited.add(neighbor)
                result[key].append(neighbor)
                stack.append(iter(adj_list[neighbor]))
                break
        else:
            # All neighbours of the top vertex are done: backtrack.
            stack.pop()
edges = [('c', 'e'), ('c', 'd'), ('a', 'b'), ('d', 'e')]

# Build an undirected adjacency list: every edge is recorded in both directions.
adj_list = defaultdict(list)
for x, y in edges:
    adj_list[x].append(y)
    adj_list[y].append(x)

# One DFS per not-yet-visited vertex; each DFS fills one connected component,
# stored under the vertex the search started from.
result = defaultdict(list)
visited = set()
for vertex in adj_list:
    if vertex not in visited:
        dfs(adj_list, visited, vertex, result, vertex)

# dict.values() is a view object in Python 3; convert it so a plain list is printed.
print(list(result.values()))
Output:
[['a', 'b'], ['c', 'e', 'd']]
Note that in the above, both the components and the elements within each component may come out in arbitrary order.

If you don't need duplicate values (the ability to preserve ['a', 'a', 'b'], for example), this is a simple and fast way to do what you want via sets:
# Turn every tuple into a hashable set; duplicates collapse automatically.
iset = {frozenset(s) for s in tuple_list}
result = []
while iset:
    # Start a new group from an arbitrary remaining set.
    nset = set(iset.pop())
    check = bool(iset)  # anything left to compare against?
    while check:
        check = False
        for s in iset.copy():
            if not nset.isdisjoint(s):
                # The group grew, so sets rejected earlier may overlap now:
                # another full pass is required.
                check = True
                iset.remove(s)
                nset.update(s)
    result.append(tuple(nset))  # back to a tuple for the output list
gives
[('c', 'e', 'd'), ('a', 'b')]

This has a bad performance because list-contains checks are O(n) but it's quite short:
result = []
for tup in tuple_list:
    # Positions of every existing group that shares at least one item with tup.
    # Existing groups are pairwise disjoint, so only these can need merging.
    hits = [idx for idx, already in enumerate(result)
            if any(item in already for item in tup)]
    if not hits:
        result.append(tup)
        continue
    # tup may bridge several groups: fold tup and all later hits into the
    # first hit so that no two groups in result ever overlap.
    merged = list(result[hits[0]])
    for source in [tup] + [result[idx] for idx in hits[1:]]:
        for item in source:
            if item not in merged:
                merged.append(item)
    # tuples are immutable so we need to set the result item directly
    result[hits[0]] = tuple(merged)
    # Delete the absorbed groups back-to-front so earlier indices stay valid.
    for idx in reversed(hits[1:]):
        del result[idx]
This has the nice side-effect that the order is kept:
>>> result
[('c', 'e', 'd'), ('a', 'b')]

I had that problem with sets so I'm contributing my solution to this. It keeps combining sets that share one or more common elements for as long as possible.
My example data:
data = [['A','B','C'],['B','C','D'],['D'],['X'],['X','Y'],['Y','Z'],['M','N','O'],['M','N','O'],['O','A']]
data = list(map(set,data))
My code to solve the problem:
# Run merge passes until a pass no longer reduces the number of sets.
oldlen = len(data) + 1
while len(data) < oldlen:
    oldlen = len(data)
    for i in range(len(data)):
        for j in range(i + 1, len(data)):
            if data[i] & data[j]:
                # Absorb set j into set i and leave an empty placeholder so
                # the indices stay valid for the rest of this pass.
                data[i] = data[i] | data[j]
                data[j] = set()
    # Drop the placeholders (and any set that was empty to begin with).
    data = [group for group in data if group]
Result:
[{'A', 'B', 'C', 'D', 'M', 'N', 'O'}, {'X', 'Y', 'Z'}]

The task becomes trivial with NetworkX, a library for graph manipulation. Similar to this answer by @niemmi, you'd need to find the connected components:
import networkx as nx
tuple_list = [('c', 'e'), ('c', 'd'), ('a', 'b'), ('d', 'e')]
# Each tuple is passed to the graph constructor as one edge.
graph = nx.Graph(tuple_list)
# connected_components() is wrapped in list() so the components can be printed.
result = list(nx.connected_components(graph))
print(result)
# [{'e', 'c', 'd'}, {'b', 'a'}]
To get the result as a list of tuples:
# `graph` is the nx.Graph built above; convert each component set to a tuple.
result = list(map(tuple, nx.connected_components(graph)))
print(result)
# [('d', 'e', 'c'), ('a', 'b')]

Use sets. You are checking for overlap and accumulation of (initially small) sets, and Python has a data type for that:
#!python3
#tuple_list = [('c', 'e'), ('c', 'd'), ('a', 'b'), ('d', 'e')]
tuple_list = [(1,2), (3,4), (5,), (1,3,5), (3,'a'),
              (9,8), (7,6), (5,4), (9,'b'), (9,7,4),
              ('c', 'e'), ('e', 'f'), ('d', 'e'), ('d', 'f'),
              ('a', 'b'),
             ]

set_list = []  # the groups found so far; non-empty groups never overlap
print("Tuple list:", tuple_list)
for t in tuple_list:
    tset = set(t)
    # Every existing group that shares an element with the new tuple.
    matched = [s for s in set_list if tset & s]
    if not matched:
        set_list.append(tset)
        continue
    # The first matching group absorbs the tuple ...
    primary, *others = matched
    primary |= tset
    if others:
        # ... and every other group the tuple connects it to.
        for other in others:
            primary |= other
            other.clear()
        # Emptied groups are only placeholders; drop them.
        set_list = [s for s in set_list if s]
print('\n'.join(str(s) for s in set_list))

I bumped into this problem when resolving coreferences, I need to merge sets in a list of sets that have common elements:
import copy
def merge(list_of_sets):
    """Return a new list in which all overlapping sets have been unioned.

    The input is deep-copied first, so the caller's list and sets are left
    untouched. Overlapping pairs are merged one at a time; each merged set is
    appended to the end of the working list.
    """
    list_of_sets = copy.deepcopy(list_of_sets)
    indices = find_fist_overlapping_sets(list_of_sets)
    while indices:
        first, second = indices
        combined = list_of_sets[first].union(list_of_sets[second])
        # Keep every set except the two being merged, then add their union.
        list_of_sets = [
            s
            for idx, s in enumerate(list_of_sets)
            if idx not in indices
        ]
        list_of_sets.append(combined)
        # Look for the next overlapping pair in the updated list.
        indices = find_fist_overlapping_sets(list_of_sets)
    return list_of_sets
def find_fist_overlapping_sets(list_of_sets):
    """Return the indices ``(i, j)``, ``i < j``, of the first overlapping pair.

    Pairs are examined in order of increasing ``i``, then increasing ``j``.
    Returns ``None`` when no two sets share an element.
    """
    for i, i_set in enumerate(list_of_sets):
        # Index directly instead of slicing, to avoid copying the tail of the
        # list on every outer iteration.
        for j in range(i + 1, len(list_of_sets)):
            if not i_set.isdisjoint(list_of_sets[j]):
                return i, j
    return None

Related

Find longest combination of non-overlapping pairs

I need to find the length of the longest combination of pairs that can be made from a list of pairs, without any common elements.
For example the following list of pairs:
[(A, B), (A, D), (B, C), (B, D), (C, D)]
Would have these combinations:
[(A, B), (C, D)]
[(A, D), (B, C)]
[(B, D)]
And so the longest combination would be 2 pairs in length.
This needs to be able to handle up to several thousand pairs so generating all possible combinations of pairs at each possible length and checking for overlaps would not work.
However, the total number of unique elements across all pairs is capped at 100, so the longest possible combination that could be encountered would be 50 pairs.
Is there an efficient way to do this?
okay this is what I have, maybe not the best but its something
Combo seeds a combination with two non-overlapping pairs and feeds it to Combine, along with the rest of the array that has not been checked yet.
Combine takes the leftover array, the current combo and a list of used elements, then checks each candidate tuple: if the tuple from the leftover array has any element in the used list, it is skipped; if it doesn't, it is added to the combo and passed to a further recursive call of Combine, until the combo is as long as it can be.
arr = [('A', 'B'), ('A', 'D'), ('B', 'C'), ('B', 'D'), ('E', 'D'), ("A",'F'),('J','K'),('M','K'),('K','D'),('B','F')]


def Combo(arr):
    """Build one greedy combination of non-overlapping pairs per starting pair.

    For each pair in ``arr``, the first later pair that shares no element with
    it seeds a combination, which ``Combine`` then extends. Starting pairs for
    which no such partner exists contribute nothing to the result.
    """
    combos = []
    for i, tup1 in enumerate(arr):
        combo = [tup1]
        used = [tup1[0], tup1[1]]
        for j, tup2 in enumerate(arr[i:]):
            if (tup2[0] in used) or (tup2[1] in used):
                continue
            used.extend(tup2)
            combo.append(tup2)
            # NOTE(review): j indexes arr[i:], yet the slice below is taken
            # from the full arr, so Combine also revisits earlier pairs. The
            # sample output shown with this answer relies on that; confirm
            # before changing it to arr[i + j:].
            combo = Combine(arr[j:], combo, used)
            combos.append(combo)
    return combos
def Combine(arr, combo, used):
    """Greedily extend ``combo`` with pairs from ``arr`` that reuse no element.

    ``arr`` is scanned once from left to right. A pair is accepted when none
    of its elements is in ``used``; it is then appended to ``combo`` and its
    elements are appended to ``used``. Both lists are modified in place, and
    ``combo`` is returned.

    This is a single loop rather than a recursive call per accepted pair, so
    long inputs cannot exceed the recursion limit. The result is the same.
    """
    for tup in arr:
        if any(el in used for el in tup):
            continue
        combo.append(tup)
        used.extend(tup)
    return combo
Combo(arr)
OUTPUT
[[('A', 'B'), ('E', 'D'), ('J', 'K')],
[('A', 'D'), ('B', 'C'), ('J', 'K')],
[('B', 'C'), ('E', 'D'), ('A', 'F'), ('J', 'K')],
[('B', 'D'), ('A', 'F'), ('J', 'K')],
[('E', 'D'), ('A', 'F'), ('B', 'C'), ('J', 'K')],
[('A', 'F'), ('J', 'K'), ('B', 'C'), ('E', 'D')],
[('J', 'K'), ('B', 'F'), ('E', 'D')],
[('M', 'K'), ('B', 'F'), ('E', 'D')],
[('K', 'D'), ('B', 'F')]]
as far as I know this should give you each unique combination in a list
Rephrasing the question, we want to find the biggest set of non-overlapping elements of pairs. Probably not the best solution but should work:
def process(pairs):
    """Return the length of the longest greedy combination of disjoint pairs.

    Each pair in turn seeds a combination; the remaining pairs are then
    scanned in their original order, and a pair is accepted when it adds
    exactly two new elements to the set of elements already used. The largest
    combination size found over all seeds is returned (0 for empty input).

    This is a greedy heuristic, so the result is a lower bound on the true
    maximum, not a guaranteed optimum.
    """
    max_length = 0
    for i, seed in enumerate(pairs):
        used = set(seed)
        curr = 1
        for j, candidate in enumerate(pairs):
            if j == i:
                continue
            # Elements of the candidate that are not used yet; the pair only
            # fits when both of its elements are new and distinct.
            fresh = set(candidate) - used
            if len(fresh) == 2:
                used |= fresh
                curr += 1
        max_length = max(curr, max_length)
    return max_length
We populate our initial set with the current pair and then if the next pair's elements are not presented in the current set we extend it. We continue this process until we checked all remaining pairs. I used this function for testing:
import random
import string
import timeit
def get_random_pairs(num):
    """Return ``num`` random pairs of uppercase ASCII letters.

    The two letters of a pair are drawn independently, so a pair may contain
    the same letter twice.
    """
    letters = string.ascii_uppercase
    return [(random.choice(letters), random.choice(letters)) for _ in range(num)]
print(timeit.timeit('process(pairs)', number=5, setup="from __main__ import process,get_random_pairs; pairs = get_random_pairs(3000)")/5)
On my machine (Intel i7-9750H (12) @ 4.500GHz) it takes about 5-6 seconds to process 3000 pairs.

nested value from nested index python

I am trying to get an output list from nested list based on nested indices.
Input:
list_a = [(a,b,c,d), (f,g), (n,p,x)]
sub_index_a = [(0,2),(1),(0,1)]
Output:
output_list = [(a,c), (g), (n,p)]
list_a = [('a', 'b', 'c', 'd'), ('f', 'g'), ('n', 'p', 'x')]
sub_index_a = [(0, 2), (1,), (0, 1)]


def check(i, e):
    """Return, as a tuple, the items of ``list_a[i]`` at the positions in ``e``."""
    return tuple(list_a[i][ee] for ee in e)


# Pair each index tuple with the row of list_a at the same position.
outputlist = [check(i, e) for i, e in enumerate(sub_index_a)]
print(outputlist)
This evaluates to
[('a', 'c'), ('g',), ('n', 'p')]
Well, ("g") just evaluates to "g"; the actual one-element tuple would look like ("g",) (i.e. tuple(["g"])), and the same goes for (1). But I think I found a half-decent workaround. Hopefully, this is your desired solution.
list_a = [('a','b','c','d'), ('f','g'), ('n','p','x')]
sub_index_a = [(0,2),(1),(0,1)]  # NB: (1) is the plain int 1, not a tuple
# A tuple/list entry selects several items (returned as a tuple);
# any other entry is used as a single index and its item is returned as-is.
picked = [
    tuple(list_a[x][indx] for indx in i) if isinstance(i, (tuple, list)) else list_a[x][i]
    for x, i in enumerate(sub_index_a)
]
print(picked)
[('a', 'c'), 'g', ('n', 'p')]
if you want everything returned as a tuple you can modify the comprehension to:
# Same selection as before, but a single index is wrapped in a one-element
# tuple so that every entry of the result is a tuple.
print([
    tuple(list_a[x][indx] for indx in i) if isinstance(i, (tuple, list)) else (list_a[x][i],)
    for x, i in enumerate(sub_index_a)
])
[('a', 'c'), ('g',), ('n', 'p')]
note:
if you want everything to be nested (with a single element) you would want a list of lists; E.g. [[0,2],[1],[0,1]]
Use zip and a nested comprehension:
list_a = [("a", "b", "c", "d"), ("f", "g"), ("n", "p", "x")]
# Every entry must be a real tuple, hence the trailing comma in (1,).
sub_index_a = [(0, 2), (1,), (0, 1)]
# zip pairs each row with its index tuple; the inner generator picks the items.
output_list = [
    tuple(row[position] for position in positions)
    for row, positions in zip(list_a, sub_index_a)
]
# [('a', 'c'), ('g',), ('n', 'p')]

Convert list of tuples such that [(a,b,c)] converts to [(a,b),(a,c)]

Thoughts on how I would do this? I want the first value in the tuple to pair with each successive value. This way each resulting tuple would be a pair starting with the first value.
I need to do this: [(a,b,c)] --> [(a,b),(a,c)]
You can try this.
# Unpack the single tuple out of the one-element list.
(t,)=[('a','b','c')]
# Pair the first item with each of the remaining items.
[(t[0],i) for i in t[1:]]
# [('a', 'b'), ('a', 'c')]
Using itertools.product
import itertools

it = iter(('a', 'b', 'c'))
# Wrap the first item in a list: product() iterates over each argument, so a
# bare first item would be split up if it were a longer string, and a
# non-iterable first item (e.g. an int) would raise TypeError.
list(itertools.product([next(it)], it))
# [('a', 'b'), ('a', 'c')]
Using itertools.repeat
it=iter(('a','b','c'))
# next(it) consumes the first item; repeat() supplies it endlessly and zip
# stops as soon as the remaining items of `it` run out.
list(zip(itertools.repeat(next(it)),it))
# [('a', 'b'), ('a', 'c')]
a = [('a', 'b', 'c')]
a = a[0]  # the single tuple inside the list
# Pair the first item with every later item; `a` still names the tuple while
# the comprehension is evaluated and is rebound to the result afterwards.
a = [(a[0], item) for item in a[1:]]
Try this !
A solution that uses itertools's combinations module.
from itertools import combinations, islice

arr = ['a', 'b', 'c']
# combinations() emits pairs in positional order, so the first len(arr) - 1
# pairs are exactly the ones that start with the first element. Taking just
# those avoids generating every pair, and stays correct when the first value
# also occurs later in arr.
for pair in islice(combinations(arr, 2), max(len(arr) - 1, 0)):
    print(pair, end=" ")
This would give a solution ('a', 'b') ('a', 'c')
You can just append pairs of tuples to a list:
original = [(1,2,3)]


def makePairs(lis):
    """Pair the first item of every tuple in ``lis`` with each of its later items.

    Works for tuples of any length; a tuple with fewer than two items
    contributes no pairs.
    """
    return [(t[0], item) for t in lis for item in t[1:]]


print(makePairs(original))
Output:
[(1, 2), (1, 3)]
If your tuples are arbitrary length you can write a simple generator:
def make_pairs(iterable):
    """Yield ``(first, item)`` for every item after the first one in ``iterable``.

    Yields nothing when ``iterable`` is empty or has a single item.
    """
    iterator = iter(iterable)
    try:
        first = next(iterator)
    except StopIteration:
        # Empty input. Letting StopIteration escape from a generator body
        # would be turned into RuntimeError (PEP 479), so finish cleanly.
        return
    for item in iterator:
        yield first, item
example result:
my_tuple = ('a', 'b', 'c', 'd')
list(make_pairs(my_tuple))
Out[170]: [('a', 'b'), ('a', 'c'), ('a', 'd')]
This is a memory-efficient solution.

Find tuple in list with same first item and return another list

I have a list like this in Python:
[('a', 'b'), ('a', 'c'),('d','f')]
and I want join items that have same first item and result like this:
[('a', 'b', 'c'),('d','f')]
Here is one way to do it. For efficiency, we build a dict with the first value as key. We keep the values in the order in which they appear (and the tuples in their original order as well, if you use Python >= 3.7 - otherwise you will have to use a collections.OrderedDict)
def join_by_first(sequences):
    """Merge the sequences that share the same first item.

    Returns a list of tuples, one per distinct first item, in order of first
    appearance. Each tuple holds that first item followed by the remaining
    items of every sequence starting with it. Empty sequences have no first
    item and are skipped.
    """
    out = {}
    for seq in sequences:
        if not seq:
            continue
        head = seq[0]
        # The first sequence for a key creates its list, seeded with the key.
        out.setdefault(head, [head]).extend(seq[1:])
    return [tuple(values) for values in out.values()]
join_by_first([('a', 'b'), ('a', 'c'),('d','f')])
# [('a', 'b', 'c'), ('d', 'f')]
You cannot edit tuples - they are immutable. You can use lists and convert them all back to tuples afterward:
data = [('a', 'b'), ('a', 'c'),('d','f')]
new_data = []
# Only ADJACENT tuples with the same first item are joined, so equal keys
# must already be next to each other in data.
for d in data:                                    # loop over your data
    if new_data and new_data[-1][0] == d[0]:      # same first item as the previous group?
        new_data[-1].extend(d[1:])                # yes: extend that group
    else:
        new_data.append(list(d))                  # no (or nothing in yet): start a new group
print(new_data)                                   # all are lists
new_data = [tuple(x) for x in new_data]
print(new_data)                                   # all are tuples again
Output:
[['a', 'b', 'c'], ['d', 'f']] # all are lists
[('a', 'b', 'c'), ('d', 'f')] # all are tuples again
See Immutable vs Mutable types
I feel like the simplest solution is to build a dictionary in which:
keys are the first items in the tuples
values are lists containing all the second items from the tuples
Once we have that we can then build the output list:
from collections import defaultdict
def merge(pairs):
    """Group tuples by their first item.

    Returns one tuple per distinct first item, in order of first appearance:
    the key followed by all the remaining items of every tuple that starts
    with it. Works for pairs and for longer tuples.
    """
    mapping = defaultdict(list)
    for k, *rest in pairs:
        mapping[k].extend(rest)
    return [(k, *v) for k, v in mapping.items()]
pairs = [('a', 'b'), ('a', 'c'),('d','f')]
print(merge(pairs))
This outputs:
[('a', 'b', 'c'), ('d', 'f')]
This solution is in O(n) as we only iterate two times over each item from pairs.

Checking that the geometry for a triangle is contained in a list of lines

I have a list of lines Lines=([('B', 'C'), ('D', 'A'), ('D', 'C'), ('A', 'B'), ('D', 'B')]) and geometry = ('B', 'C', 'D') is a list of points that set up the triangle (B,C,D).
I want to check whether geometry can be set up from list of lines in Lines. How can I create a function to check that status? True or False.
Sample Functionality with input Lines:
>> Lines=([('B', 'C'), ('D', 'A'), ('D', 'C'), ('A', 'B'), ('D', 'B'),])
>> geometry1 = ('B', 'C', 'D')
>> check_geometry(Lines, geometry1)
True
>> geometry2 = ('A', 'B', 'E')
>> check_geometry(Lines, geometry2)
False
This is my code, but the result is wrong:
import itertools
def check_geometry(line, geometry):
    # Asker's version (kept as posted). Every required side is built as a *set*.
    dataE = [set(x) for x in itertools.combinations(geometry, 2)]
    for data in dataE:
        # When `line` holds tuples, as in the call below, a set never compares
        # equal to any of them, so this test fails for the very first side and
        # the function returns False.
        if data not in line:
            return False
    return True
Lines = [('B', 'C'), ('D', 'A'), ('D', 'C'), ('A', 'B'), ('D', 'B'),]
geometry1 = ('B', 'C', 'D')
# Python 2 print statement, as in the original post.
print check_geometry(Lines, geometry1)
Output:
False
For triangles:
You could use the built-in all to do this, making sure to first sort the list contents, since their order might differ from that generated by itertools.combinations:
sLines = [tuple(sorted(l)) for l in Lines]
dataE = itertools.combinations('BCD', 2)
Now you can call all which will check that every value in dataE is present in sLines:
all(l1 in sLines for l1 in dataE)
Which will return True.
So, your check_geometry function could look something like:
def check_geometry(line, geometry):
    """Return True if every pair of points in ``geometry`` is joined by a line.

    ``line`` is an iterable of point pairs and ``geometry`` an iterable of
    points. The order of the two points within a line does not matter, and
    neither does the order of the points in ``geometry``.
    """
    sLines = [tuple(sorted(l)) for l in line]
    dataE = itertools.combinations(geometry, 2)
    # Sort each required pair too: combinations() keeps the input order, so
    # an unsorted geometry would otherwise never match the sorted lines.
    return all(tuple(sorted(l1)) in sLines for l1 in dataE)
Calls made will now check if the Lines contain the geometry:
check_geometry(Lines, 'BCD')
# returns True
check_geometry(Lines, 'ABE')
# returns False
A bit more general:
To generalize this a bit, we can drop itertools.combinations and instead utilize zip. The following makes some appropriate changes to the function in order to accommodate zip but performs similar stuff:
def check_geometry(line, geometry):
    """Return True if the closed shape through ``geometry`` can be built from ``line``.

    The sides checked are the consecutive points of ``geometry`` plus the side
    from the last point back to the first. ``geometry`` may be any iterable of
    points (string, tuple, list, iterator, ...); ``line`` is an iterable of
    point pairs whose internal order does not matter.
    """
    # Materialise the points once so that slicing and concatenation work for
    # any iterable, not only for sequences of a single type.
    points = list(geometry)
    sLines = [sorted(l) for l in line]
    # Rotating the points by one position pairs every point with its successor.
    dataE = [sorted(x) for x in zip(points, points[1:] + points[:1])]
    return all(l1 in sLines for l1 in dataE)
The key difference here is:
dataE is now a list of lists containing the result of zip(geometry, geometry[1:] + geometry[:1]). What zip does in this case is it takes a string like "BCDA" and the same string with the first element added to the end geometry[1:] + geometry[:1] (i.e "CDAB") and creates entries signifying the sides of a shape:
>>> s = "BCDA"
>>> s[1:] + s[:1]
>>> 'CDAB'
>>> list(zip(s, s[1:] + s[:1]))
[('B', 'C'), ('C', 'D'), ('D', 'A'), ('A', 'B')]
Now we can check that a geometry with points "BCDA" can be constructed by the lines in Lines:
check_geometry(Lines, "BCD")
# True
check_geometry(Lines, "BCDA")
# True
check_geometry(Lines, "BCDF")
# False
Note 1: Lines can be written as:
Lines=[('B', 'C'), ('D', 'A'), ('D', 'C'), ('A', 'B'), ('D', 'B')]
The parenthesis () and comma , have no additional effect here, you can drop them :-) .
Note 2: The geometry parameter for check_geometry can be any iterable (tuples, lists, strings):
check_geometry(lines, "BCD") == check_geometry(lines, ('B', 'C', 'D'))
Creating and passing a tuple to it seems somewhat odd in this case (alas, you might have a good reason to do so). Unless reasons require it, I would suggest going with strings as the value for parameter geometry.
I think A,B,C can be string or whatever which define a point that set up a line
Okay, I'll be using strings for my answer then, you should be able to adjust the code to your needs.
def check_for_triangle(tri, lines):
    """Return True if all three sides of the triangle ``tri`` occur in ``lines``.

    ``tri`` is a sequence of three points and ``lines`` an iterable of point
    pairs. A side matches a line in either direction.
    """
    # Normalise every line to a tuple so that lines given as lists can be
    # compared with the tuples produced by zip below.
    known = [tuple(line) for line in lines]
    # Pair each corner with the next one, wrapping from the last to the first.
    lines_needed = zip(tri, (tri[1], tri[2], tri[0]))
    return all(side in known or side[::-1] in known for side in lines_needed)
lines=[('B', 'C'), ('D', 'A'), ('D', 'C'), ('A', 'B'), ('D', 'B')]
tri1 = ('B', 'C', 'D')
tri2 = ('A', 'B', 'E')
print(check_for_triangle(tri1, lines)) # True
print(check_for_triangle(tri2, lines)) # False
The idea is to generate all lines (represented by a pair of points) we need to find in lines for a given triangle with zip. After that, we check whether all these lines can be found in lines.
Checking for line[::-1] as well is needed because the line ('A', 'B') is the same line as ('B', 'A').

Categories

Resources