Merging overlapping (str) objects

Merging overlapping (str) objects - python

The problem is the following :
I want to go from having this set
{'A/B', 'B/C', 'C/D', 'D/E', ..., 'U/V', 'V/W', ..., 'X/Y', ..., 'Z', ...}
to this set
{'A/B/C/D/E', ..., 'U/V/W', ..., 'X/Y', ..., 'Z', ...}
where the objects A, B, C ... are just strings of characters. The output should be independent of the order in which the objects appear (i.e. if you scramble the objects in the set, the solution should always be the same).
In other words I want to merge overlapping objects.
Inputs of the following form cannot happen :
{"A/B", "B/C", "B/D"}
{"A/B", "B/C", "C/A"}
There can be objects with no '/' in them.
Here is a partial solution I've come up with :
example = {'A/B', 'B/C', 'C/D', 'D/E', 'U/V', 'V/W', 'X/Y'}


def ext_3plus(unit):
    """Merge pairs of 'X/Y' strings in ``unit`` that share an end point.

    ``unit`` is a set of strings; it is modified in place and nothing is
    returned.  The candidate pairs are enumerated once, from a snapshot of
    the set, so one call performs a single merging pass.
    """
    for couple in list(itertools.combinations(list(unit), 2)):
        if '/' in couple[0] and '/' in couple[1]:
            # couple[1] ends where couple[0] starts: prepend its first part.
            if couple[0].split('/')[0] == couple[1].split('/')[1]:
                unit.remove(couple[0])
                unit.remove(couple[1])
                unit.add(couple[1].split('/')[0] + '/' + couple[0])
            # couple[0] ends where couple[1] starts: append its second part.
            if couple[0].split('/')[1] == couple[1].split('/')[0]:
                unit.remove(couple[0])
                unit.remove(couple[1])
                unit.add(couple[0] + '/' + couple[1].split('/')[1])
        else:  # the input can contain objects not having '/'
            continue
There are two problems. First, it does only one iteration:
the result on {'A/B', 'B/C', 'C/D', 'D/E','U/V', 'V/W','X/Y'}
is :
{'A/B/C', 'C/D/E', 'U/V/W', 'X/Y'}
Second, if I include objects containing no '/', the input being {'A/B', 'B/C', 'C/D', 'D/E','U/V', 'V/W','X/Y','Z'}, the result is different from the previous one :
{'A/B', 'B/C/D', 'D/E', 'U/V/W', 'X/Y', 'Z'}
So there should be a recursive call on the first iteration etc.
How should it be done ?

If I understood correctly, this can be seen as a graph problem and solved as such:
import networkx as nx

example = {'A/B', 'B/C', 'C/D', 'D/E', 'U/V', 'V/W', 'X/Y', "Z"}
# convert each string to an edge
# each pattern on either side of "/" is a node
edges = [tuple(s.split("/")) for s in example if "/" in s]
nodes = [s for s in example if "/" not in s]
# create directed graph from edges
g = nx.from_edgelist(edges, create_using=nx.DiGraph)
g.add_nodes_from(nodes)
# find each path: the topological sort supplies the candidate start nodes
runs = []
for e in nx.topological_sort(g):
    # in-degree 0 means it is the start of a new path
    if g.in_degree(e) != 0:
        continue
    # Follow the successors explicitly instead of assuming that the nodes of
    # one chain are adjacent in the sorted order: a valid topological order
    # may interleave nodes that belong to different chains.
    current = [e]
    successors = list(g.successors(e))
    while successors:
        # the stated input never branches, so there is at most one successor
        current.append(successors[0])
        successors = list(g.successors(successors[0]))
    runs.append(current)
# format the result
result = ["/".join(run) for run in runs]
print(result)
Output
['Z', 'U/V/W', 'X/Y', 'A/B/C/D/E']
If I'm not mistaken the overall complexity of this approach is O(n). More on topological sorting can be found here.
UPDATE
In networkx 2.6.4 use lexicographical_topological_sort

You can use a recursive generator function:
vals = ['A/B', 'B/C', 'C/D', 'D/E', 'U/V', 'V/W', 'X/Y']
data = [i.split('/') for i in vals]


def paths(d, c=None, s=None):
    """Yield every chain reachable from ``d``, then the remaining chains.

    d -- label of the node to continue from
    c -- labels already collected for the chain being built
    s -- labels visited so far on the way to ``d``

    Edges are read from the module-level ``data`` list of ``[source, target]``
    pairs.  When a chain ends, the walk restarts from the first unvisited
    label that is never a target, i.e. from the head of another chain, so the
    result does not depend on the order of ``data`` as long as the first call
    is given a chain head.
    """
    c = [] if c is None else c
    seen = ([] if s is None else s) + [d]
    if not (k := [b for a, b in data if a == d]):
        # ``d`` has no outgoing edge: the current chain is complete.
        yield c + [d]
        targets = {b for _, b in data}
        if (t := [a for a, _ in data if a not in seen and a not in targets]):
            yield from paths(t[0], c=[], s=seen)
    else:
        for i in k:
            yield from paths(i, c=c + [d], s=seen)


# Start from a label that is never a target; fall back to the first source.
_targets = {b for _, b in data}
vals = list(paths(next((a for a, _ in data if a not in _targets), data[0][0])))
Output:
[['A', 'B', 'C', 'D', 'E'], ['U', 'V', 'W'], ['X', 'Y']]
It should be noted, however, that the solution above will only work on inputs that contain standard edge definitions. If the contents of vals can vary in the number of items separated by the /, then you can use the solution below:
class Node:
    """One node of the path tree: ``n`` is its label, ``c`` its child nodes."""

    def __init__(self, n, c=None):
        # Use a fresh list per instance rather than a shared mutable default.
        self.n, self.c = n, ([] if c is None else c)

    def __contains__(self, e):
        """Return True if either end of ``e`` is this node or a descendant."""
        return e[0] == self.n or e[-1] == self.n or any(e in i for i in self.c)

    def add_edge(self, e):
        """Insert ``e``, a list of one or two labels, below this node."""
        if self.n != e[0] and len(e) > 1 and (m := [i for i in self.c if i.n == e[-1]]):
            # The edge ends at one of our children: put its source in between.
            self.c = [i for i in self.c if i != m[0]] + [Node(e[0], [m[0]])]
        elif self.n == e[0]:
            # The edge starts here: add its target unless it is already a child.
            if len(e) > 1 and not any(i.n == e[1] for i in self.c):
                self.c = [*self.c, Node(e[1])]
        elif (m := [i for i in self.c if e in i]):
            # A subtree already holds one end of the edge: delegate to it.
            m[0].add_edge(e)
        else:
            # Neither end is known yet: start a new branch below this node.
            self.c = [*self.c, Node(e[0], [] if len(e) == 1 else [Node(e[1])])]


vals = ['A/B/C', 'A/B', 'B/C', 'C/D', 'D/E', 'U/V', 'V/W', 'X/Y', 'K']
n = Node(None)
for i in vals:
    k = i.split('/')
    # feed every consecutive pair; the last slice holds a single label
    for j in range(len(k)):
        n.add_edge(k[j:j+2])


def get_paths(n, c=None):
    """Yield the list of labels on every root-to-leaf path below ``n``."""
    c = [] if c is None else c
    if not n.c:
        yield c + [n.n]
    else:
        for k in n.c:
            yield from get_paths(k, c + [n.n])


# drop the label of the artificial root (None) from every path
final_result = [i[1:] for i in get_paths(n)]
print(final_result)
Output:
[['A', 'B', 'C', 'D', 'E'], ['U', 'V', 'W'], ['X', 'Y'], ['K']]
With the trie-style approach of class Node, the order of the input (vals) does not matter (no sort is required) and input paths of any depth can be added.

It might not be the most efficient, but you could just repeat the loop until there's nothing modified.
def ext_3plus(unit):
    """Merge overlapping '/'-separated chains in the set ``unit`` in place.

    Two members are merged when the last segment of one equals the first
    segment of the other ('A/B' and 'B/C' become 'A/B/C').  Passes are
    repeated until one of them leaves the size of the set unchanged.
    Members without a '/' are left untouched.  Nothing is returned.
    """
    import itertools

    while True:
        oldlen = len(unit)
        for first, second in itertools.combinations(list(unit), 2):
            # The pairs come from a snapshot; skip members merged away
            # earlier in this pass.
            if first not in unit or second not in unit:
                continue
            if '/' not in first or '/' not in second:
                continue
            head, tail = first.split('/'), second.split('/')
            if head[0] == tail[-1]:
                # ``second`` comes before ``first`` in the chain
                head, tail = tail, head
            elif head[-1] != tail[0]:
                continue
            unit.remove(first)
            unit.remove(second)
            # keep every segment, writing the shared one only once
            unit.add('/'.join(head + tail[1:]))
        if len(unit) == oldlen:
            # Nothing was merged, so we're done
            break

Related

Topological Sort Algorithm (DFS) Implementation in Python

I am new to python and algorithms. I have been trying to implement a topological sorting algorithm for a while but can't seem to create a structure that works. The functions I have made run on a graph represented in an adj list.
When I run a DFS, the nodes are discovered top-down, and nodes that have already been visited are not processed again:
def DFS(location, graph, visited=None):
    """Depth-first walk of ``graph`` (an adjacency list) from ``location``.

    Every newly reached node is appended to the module-level list
    ``node_visited``.  Returns the list of visited flags, or None when
    ``location`` had already been visited.
    """
    if visited is None:
        visited = [False] * len(graph)
    if visited[location]:
        return
    visited[location] = True
    node_visited.append(location)
    for node in graph[location]:
        DFS(node, graph, visited)
    return visited
When I am trying to build a topological sort algorithm, I create a new function which essentially checks the "availability" of that node to be added to the sorted list (ie: whether its neighbouring nodes have been visited already)
def availability(graph, node):
    """Return True when no neighbour of ``node`` is in ``available_nodes``.

    ``available_nodes`` is a module-level collection defined outside this
    function.
    """
    return not any(neighbour in available_nodes for neighbour in graph[node])
However, my issue is that once I have followed a path down to the bottom of the graph, the DFS does not allow me to revisit those nodes. Hence, any updates I make once I discover the end of the path cannot be processed.
My approach may be totally off, but I am wondering if someone could help improve my implementation design, or explain how the implementation is commonly done. Thanks in advance.

You don't need that availability check to do a topological sort with DFS.
DFS itself ensures that you don't leave a node until its children have already been processed, so if you add each node to a list when DFS finishes with it, they will be added in (reverse) topological order.
Don't forget to do the whole graph, though, like this:
def toposort(graph):
    """Return the nodes of ``graph`` in reverse topological order.

    ``graph`` is an adjacency list indexed by node number: ``graph[i]`` holds
    the nodes that node ``i`` points to.  A node is appended to the result
    only after every node reachable from it has been appended.
    """
    visited = [False] * len(graph)
    result = []

    def DFS(node):
        if visited[node]:
            return
        visited[node] = True
        for adj in graph[node]:
            DFS(adj)
        # post-order: everything below ``node`` is already in ``result``
        result.append(node)

    # start from every node so that disconnected parts are covered as well
    for i in range(len(graph)):
        DFS(i)
    return result

class Graph:
    """Directed graph stored as a dict mapping each node to its successors."""

    def __init__(self):
        self.edges = {}

    def addNode(self, node):
        """Register ``node`` with an empty successor list."""
        self.edges[node] = []

    def addEdge(self, node1, node2):
        """Add the edge node1 -> node2; ``node1`` must already be registered."""
        self.edges[node1] += [node2]

    def getSub(self, node):
        """Return the successor list of ``node``."""
        return self.edges[node]

    def DFSrecu(self, start, path):
        """Append ``start`` and everything reachable from it to ``path``.

        Successors are appended before ``start`` itself (post-order); nodes
        already in ``path`` are skipped.  Returns ``path``.
        """
        for node in self.getSub(start):
            if node not in path:
                path = self.DFSrecu(node, path)
        if start not in path:
            path += [start]
        return path

    def topological_sort(self, start):
        """Return all nodes in DFS post-order, beginning the walk at ``start``."""
        topo_ordering_list = self.DFSrecu(start, [])
        # An arbitrary start node does not reach every node, so walk the
        # remaining ones too.  This uses ``self`` rather than a module-level
        # graph variable, so the method works for any instance.
        for node in self.edges:
            if node not in topo_ordering_list:
                topo_ordering_list = self.DFSrecu(node, topo_ordering_list)
        return topo_ordering_list
if __name__ == "__main__":
    # Build the example graph: register every node first, then the edges.
    g = Graph()
    for node in ['S', 'B', 'A', 'C', 'G', 'I', "L", 'D', 'H']:
        g.addNode(node)
    g.addEdge("S", "A")
    g.addEdge("S", "B")
    g.addEdge("B", "D")
    g.addEdge("D", "H")
    g.addEdge("D", "G")
    g.addEdge("H", "I")
    g.addEdge("I", "L")
    g.addEdge("G", "I")
    # Sort twice, starting the walk from two different nodes.
    last_path1 = g.topological_sort("D")
    last_path2 = g.topological_sort("S")
    print("Start From D: ",last_path1)
    print("start From S: ",last_path2)
Output:
Start From D: ['L', 'I', 'H', 'G', 'D', 'A', 'B', 'S', 'C']
start From S: ['A', 'L', 'I', 'H', 'G', 'D', 'B', 'S', 'C']
You can see that 'C' is included in the topologically sorted list even though it is not connected to any other node: it is still part of the graph, so it has to be visited.
That is why you need the for loop in the topological_sort() function.

Intersection with order between two strings

My question is: how can we find the intersection of two strings?
How could we do that?
For example "address" and "dress" should return "dress".
I used a dict to implement my function, but I can only sort these characters and not output them with the original order? So how should I modify my code?
def IntersectStrings(first, second):
    """Return the characters shared by both strings, in sorted order.

    A character that occurs several times in both inputs is repeated as often
    as it occurs in the input where it is rarer.  The two per-character count
    dicts are printed before the result is built.
    """
    a = {}
    b = {}
    for c in first:
        a[c] = a.get(c, 0) + 1
    for c in second:
        b[c] = b.get(c, 0) + 1
    l = []
    print(a, b)
    for key in sorted(a):
        if key in b:
            # repeat the character once per shared occurrence
            l.append(key * min(a[key], b[key]))
    return ''.join(l)


print(IntersectStrings('address', 'dress'))

There are lots of intersecting strings. One way is to create a set of all substrings of each string and then intersect the two sets. If you want the biggest intersection, just take the max of the resulting set, e.g.:
def substrings(s):
    """Yield every non-empty contiguous substring of ``s``."""
    for i in range(len(s)):
        for j in range(i, len(s)):
            yield s[i:j+1]


def intersect(s1, s2):
    """Return the set of substrings that ``s1`` and ``s2`` have in common."""
    return set(substrings(s1)) & set(substrings(s2))
Then you can see the intersections:
>>> intersect('address', 'dress')
{'re', 'ss', 'ess', 'es', 'ress', 'dress', 'dres', 'd', 'e', 's', 'res', 'r', 'dre', 'dr'}
>>> max(intersect('address', 'dress'), key=len)
'dress'
>>> max(intersect('sprinting', 'integer'), key=len)
'int'

How to count items in list recursively

I am looking to count the items in a list recursively. For example, I have a few lists:
a = ['b', 'c', 'h']
b = ['d']
c = ['e', 'f']
h = []
I was trying to find a way in which I find out the length of list 'a'. But in list 'a' I have 'b', 'c' and 'h' ... hence my function then goes into list 'b' and counts the number of elements there... Then list 'c' and then finally list 'h'.

b = ['d']
c = ['e', 'f']
h = []
a = [b,c,h]
def recur(l):
    """Return the summed length of the elements of ``l``, recursively."""
    if not l:  # keep going until list is empty
        return 0
    # add length of list element 0 and move to next element
    return recur(l[1:]) + len(l[0])
In [8]: recur(a)
Out[8]: 3
Added print to help understand the output:
def recur(l, call=1):
    """Return the summed length of the elements of ``l``, tracing each call.

    ``call`` is the number of the current recursive call; it is only used in
    the printed trace.
    """
    if not l:
        return 0
    print("l = {} and l[0] = {} on recursive call {}".format(l, l[0], call))
    return recur(l[1:], call + 1) + len(l[0])
If you want to get more deeply nested lists you can flatten and get the len():
b = ['d']
c = ['e', 'f', ['x', 'y'], ["x"]]
h = []
a = [b, c, h]
# ``Iterable`` lives in collections.abc; the old alias in ``collections``
# was removed in Python 3.10.
from collections.abc import Iterable


def flatten_nest(l):
    """Return a flat list with the leaf items of the nested list ``l``.

    Strings and bytes are treated as leaves, not as sequences to descend
    into.  An empty ``l`` is returned as is.
    """
    if not l:
        return l
    if isinstance(l[0], Iterable) and not isinstance(l[0], (str, bytes)):
        return flatten_nest(l[0]) + flatten_nest(l[1:])
    return l[:1] + flatten_nest(l[1:])
In [13]: len(flatten_nest(a))
Out[13]: 6

The solution that worked for me was this:
def recur(arr):
    """Return the number of items in ``arr``, counted recursively."""
    if not arr:
        return 0
    # one for the first item plus the count of the rest
    return 1 + recur(arr[1:])

Recursive Python function to produce a list of anagrams

After a lot of head scratching and googling I still can't figure this out. I'm very new to Python and I'm struggling with the syntax. Conceptually I think I have a pretty decent idea of what I want to do and how to do so recursively. Technically, however, coding it in Python is proving to be a nightmare.
Basically I want to add all of the permutations of a word to list (no duplicate characters allowed), which can then be called by another program or function.
The return command and how to handle white space is really confusing me. I want the recursive function to "return" something once it unwinds but I don't want it to stop the function until all of the characters have iterated and all the permutations have been recursively generated within those iterations. When I run the code below nothing seems to happen.
def permutations(A, B = ''):
    # Reproduced as posted: the result of the recursive calls is discarded
    # and ``res`` is rebound to the return value of ``extend``.
    assert len(A) >= 0
    assert len(A) == len(set(A))
    res = []
    if len(A) == 0: res = res.extend(B)
    else:
        for i in range(len(A)):
            permutations(A[0:i] + A[i+1:], B + A[i])
    return res
permutations('word')
If I run the code below it prints out OK to my display pane, but I can't figure out how to get it into an output format that can be used by other program like a list.
def permutations(A, B = ''):
    """Print every permutation of the distinct characters of ``A``.

    ``B`` is the prefix built so far; each finished permutation is printed on
    its own line.  Nothing is returned.
    """
    assert len(A) >= 0
    assert len(A) == len(set(A))
    if len(A) == 0:
        print(B)
    else:
        for i in range(len(A)):
            # move character i from the remaining letters to the prefix
            permutations(A[0:i] + A[i+1:], B + A[i])
permutations('word')
Please could someone advise me on this, while I have some hair left! Very gratefully received.
Thank you
Jon

Basically your mistake is in
res = res.extend(B)
.extend() doesn't return a new list, but modifies the instance.
Another problem is that you don't use the return value from your recursive calls.
Here is one way to fix your code:
def permutations(A, B = ''):
    """Return a list with every permutation of the distinct characters of ``A``.

    ``B`` is the prefix built so far and is prepended to every result.
    Raises AssertionError when ``A`` contains a repeated character.
    """
    assert len(A) == len(set(A))
    if len(A) == 0:
        return [B]
    res = []
    for i in range(len(A)):
        # collect what the recursive call produced instead of discarding it
        res.extend(permutations(A[0:i] + A[i+1:], B + A[i]))
    return res
print(permutations('word'))

Like this?
from itertools import permutations
# materialise the iterator; every permutation is a tuple of characters
a = list(permutations('word'))
print(a)
Output:
>>[('w', 'o', 'r', 'd'), ('w', 'o', 'd', 'r'), ('w', 'r', 'o', 'd'),
>>('w', 'r', 'd', 'o'), ('w', 'd', 'o', 'r'), ('w', 'd', 'r', 'o'),
>>('o', 'w', 'r', 'd'), ..............
EDIT:
I just realized you said no duplicate characters allowed. It does not really matter for 'word', but let's say you have 'wordwwwdd'. Then you could do:
[x for x in permutations(''.join(set('wordwwwdd')))]
But it will mess up the order because of using set, so it will look like:
>> [('r', 'o', 'w', 'd'), ('r', 'o', 'd', 'w'), ('r', 'w', 'o', 'd')....

I would do it like this:
def permute_nondupe_letters_to_words(iterable):
    """Return a generator of strings, one per permutation of the distinct
    items of ``iterable``.

    Duplicates are dropped with ``set``, so the order in which the
    permutations are produced follows the set's iteration order.
    """
    return (''.join(i) for i in itertools.permutations(set(iterable)))
And to use it:
word = 'word'
permutation_generator = permute_nondupe_letters_to_words(word)
bucket_1, bucket_2 = [], []
# consume the generator up to and including 'owdr'
for i in permutation_generator:
    bucket_1.append(i)
    if i == 'owdr':
        break
# the same generator resumes where the first loop stopped
for i in permutation_generator:
    bucket_2.append(i)
And
print(len(bucket_1), len(bucket_2))
prints:
(10, 14)

Here is another way to approach this problem:
it is Python 2.7 and 3.3 compatible (have not yet tested with other versions)
it will accept input containing duplicate items, and only return unique output
(ie permutations("woozy") will only return "oowzy" once)
it returns output in sorted order (and will allow you to specify sort key and ascending or descending order)
it returns string output on string input
it runs as a generator, ie does not store all combinations in memory. If that's what you want, you have to explicitly say so (example shown below)
Edit: it occurred to me that I had omitted a length parameter, so I added one. You can now ask for things like all unique 4-letter permutations from a six-letter string.
Without further ado:
from collections import Counter
import sys

if sys.hexversion < 0x3000000:
    # Python 2.x
    dict_items_list = lambda d: d.items()
    is_string = lambda s: isinstance(s, basestring)
    rng = xrange
else:
    # Python 3.x
    dict_items_list = lambda d: list(d.items())
    is_string = lambda s: isinstance(s, str)
    rng = range


def permutations(lst, length=None, key=None, reverse=False):
    """
    Generate all unique permutations of lst in sorted order

    lst      list of items to permute
    length   number of items to pick for each permutation (defaults to all items)
    key      sort-key for items in lst
    reverse  sort in reverse order?

    Returns a generator of strings when lst is a string, otherwise a generator
    of lists; returns an empty list when length is out of range.
    """
    # this function is basically a shell, setting up the values
    # for _permutations, which actually does most of the work
    if length is None:
        length = len(lst)
    elif length < 1 or length > len(lst):
        return []  # no possible answers
    # 'woozy' => [('w', 1), ('o', 2), ('z', 1), ('y', 1)]  # unknown order
    items = dict_items_list(Counter(lst))
    # => [('o', 2), ('w', 1), ('y', 1), ('z', 1)]  # now in sorted order
    # Sort on the item itself: ``key`` is documented as a key for the items
    # of lst, so it must not be handed the (item, count) pair.
    if key is None:
        items.sort(key=lambda pair: pair[0], reverse=reverse)
    else:
        items.sort(key=lambda pair: key(pair[0]), reverse=reverse)
    if is_string(lst):
        # if input was string, return generator of string
        return (''.join(s) for s in _permutations(items, length))
    # return generator of list
    return _permutations(items, length)


def _permutations(items, length):
    """Yield lists of ``length`` items drawn from ``items``.

    ``items`` is a list of (item, remaining count) pairs; an item is used at
    most as many times as its count allows.
    """
    if length == 1:
        for item, _ in items:
            yield [item]
        return
    for ndx, (item, num) in enumerate(items):
        # pick an item to start with, then make new list of remaining items
        if num == 1:
            remaining_items = items[:ndx] + items[ndx+1:]
        else:
            remaining_items = items[:ndx] + [(item, num-1)] + items[ndx+1:]
        # recurse against remaining items
        for perm in _permutations(remaining_items, length-1):
            yield [item] + perm


# test run!
words = list(permutations("woozy"))
results in
['oowyz',
'oowzy',
'ooywz',
'ooyzw',
'oozwy',
'oozyw',
'owoyz',
# ...
'zwooy',
'zwoyo',
'zwyoo',
'zyoow',
'zyowo',
'zywoo'] # 60 items = 5!/2!, as expected

Difference Between Two Lists with Duplicates in Python

I have two lists that contain many of the same items, including duplicate items. I want to check which items in the first list are not in the second list. For example, I might have one list like this:
l1 = ['a', 'b', 'c', 'b', 'c']
and one list like this:
l2 = ['a', 'b', 'c', 'b']
Comparing these two lists I would want to return a third list like this:
l3 = ['c']
I am currently using some terrible code that I made a while ago that I'm fairly certain doesn't even work properly shown below.
def list_difference(l1,l2):
    # Reproduced as posted.  Note that the comparison below looks at l1 on
    # both sides, and that both input lists are overwritten in place.
    for i in range(0, len(l1)):
        for j in range(0, len(l2)):
            if l1[i] == l1[j]:
                l1[i] = 'damn'
                l2[j] = 'damn'
    # keep whatever was not marked
    l3 = []
    for item in l1:
        if item!='damn':
            l3.append(item)
    return l3
How can I better accomplish this task?

You didn't specify if the order matters. If it does not, you can do this in >= Python 2.7:
l1 = ['a', 'b', 'c', 'b', 'c']
l2 = ['a', 'b', 'c', 'b']
from collections import Counter
c1 = Counter(l1)
c2 = Counter(l2)
# Counter subtraction keeps only the positive counts
diff = c1-c2
print(list(diff.elements()))

Create Counters for both lists, then subtract one from the other.
from collections import Counter
a = [1, 2, 3, 1, 2]
b = [1, 2, 3, 1]
# count the items of ``a``, then take one off per item of ``b``;
# unlike the ``-`` operator, ``subtract`` keeps zero and negative counts
c = Counter(a)
c.subtract(b)

To take into account both duplicates and the order of elements:
from collections import Counter
def list_difference(a, b):
    """Return the items of ``a`` left over after removing those in ``b``.

    Duplicates are respected: each occurrence in ``b`` cancels one occurrence
    in ``a``.  The result keeps the order in which the items appear in ``a``.
    """
    count = Counter(a)  # count items in a
    count.subtract(b)  # subtract items that are in b
    diff = []
    for x in a:
        # a positive count means this occurrence is not cancelled by b
        if count[x] > 0:
            count[x] -= 1
            diff.append(x)
    return diff
Example
print(list_difference("z y z x v x y x u".split(), "x y z w z".split()))
# -> ['y', 'x', 'v', 'x', 'u']
Python 2.5 version:
from collections import defaultdict
def list_difference25(a, b):
    """Return the items of ``a`` left over after removing those in ``b``.

    Same contract as the Counter-based version, written with ``defaultdict``:
    each occurrence in ``b`` cancels one occurrence in ``a`` and the order of
    ``a`` is kept.
    """
    # count items in a
    count = defaultdict(int)  # item -> number of occurrences
    for x in a:
        count[x] += 1
    # subtract items that are in b
    for x in b:
        count[x] -= 1
    diff = []
    for x in a:
        if count[x] > 0:
            count[x] -= 1
            diff.append(x)
    return diff

Counters are new in Python 2.7.
For a general solution to subtract a from b:
def list_difference(b, a):
    """Return a copy of ``b`` with one occurrence removed per item of ``a``.

    Items of ``a`` that are not (or no longer) in the copy are ignored.
    Neither input is modified.
    """
    c = list(b)
    for item in a:
        try:
            c.remove(item)
        except ValueError:
            pass  # or maybe you want to keep a values here
    return c

you can try this
# remove one occurrence from l1 for every item of l2 (modifies l1 in place)
for item in l2:
    l1.remove(item)
print(l1)

Try this one:
from collections import Counter
from typing import Sequence
def duplicates_difference(a: Sequence, b: Sequence) -> Counter:
    """
    Return the items of the longer sequence not matched in the shorter one.

    The two arguments are ordered by length first, so their order in the call
    does not matter when the lengths differ.

    >>> duplicates_difference([1,2],[1,2,2,3])
    Counter({2: 1, 3: 1})
    """
    shorter, longer = sorted([a, b], key=len)
    return Counter(longer) - Counter(shorter)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Merging overlapping (str) objects - python

Related

Topological Sort Algorithm (DFS) Implementation in Python

Intersection with order between two strings

How to count items in list recursively

Recursive Python function to produce a list of anagrams

Difference Between Two Lists with Duplicates in Python

Categories

Resources