Substitute list of paths with unique elements at every level (/)

Substitute list of paths with unique elements at every level (/) - python

I have a list of path strings ( which can form a tree structure ) like below:
xo = ['1/sometext1',
'1/1/sometext2',
'1/1/1/sometext3',
'1/1/2/sometext4',
'1/2/sometext5',
'1/2/1/sometext6',
'1/2/2/sometext7',
'2/sometext8',
'3/sometext9']
I want to convert the above list into a form like the one below, with unique numbers specific to every level, so that there is a proper differentiation between the 1's in ('1/', '1/1/', '1/1/1/') and the 2's in ('1/1/2/', '1/2/', '1/2/1/', '1/2/2/', '2/').
xd = ['123/sometext1',
'123/1234/sometext2',
'123/1234/12345/sometext3',
'123/1234/234/sometext4',
'123/2345/sometext5',
'123/2345/123456/sometext6',
'123/2345/23456/sometext7',
'234567/sometext8',
'3456/sometext9']
The unique values are just for example and can be any unique strings.

This example prefixes each level with its depth number:
import re
xo = [
    "1/sometext1",
    "1/1/sometext2",
    "1/1/1/sometext3",
    "1/1/2/sometext4",
    "1/2/sometext5",
    "1/2/1/sometext6",
    "1/2/2/sometext7",
    "2/sometext8",
    "3/sometext9",
]
# Group 1: the leading run of "<digits>/" levels; group 2: whatever follows.
pat = re.compile(r"((?:\d+/)+)(.*)")


def _tag_with_depth(path):
    """Return *path* with every numeric level rewritten as "<depth>-<number>"."""
    prefix, rest = pat.match(path).groups()
    # The prefix ends with "/", so the split leaves one empty trailing item
    # that has to be skipped.
    tagged = [
        f"{depth}-{number}"
        for depth, number in enumerate(prefix.split("/"), 1)
        if number
    ]
    return "/".join(tagged) + "/" + rest


out = [_tag_with_depth(path) for path in xo]
print(out)
Prints:
[
"1-1/sometext1",
"1-1/2-1/sometext2",
"1-1/2-1/3-1/sometext3",
"1-1/2-1/3-2/sometext4",
"1-1/2-2/sometext5",
"1-1/2-2/3-1/sometext6",
"1-1/2-2/3-2/sometext7",
"1-2/sometext8",
"1-3/sometext9",
]
EDIT: Modified example:
import re
xo = [
    "1/sometext1",
    "1/1/sometext2",
    "1/1/1/sometext3",
    "1/1/2/sometext4",
    "1/2/sometext5",
    "1/2/1/sometext6",
    "1/2/2/sometext7",
    "2/sometext8",
    "3/sometext9",
]
# Group 1: the leading run of "<digits>/" levels; group 2: whatever follows.
pat = re.compile(r"((?:\d+/)+)(.*)")
out = []
for path in xo:
    prefix, rest = pat.match(path).groups()
    # Each level is replaced by the dotted chain of all levels down to it,
    # e.g. "1/2/1/" becomes "1/1.2/1.2.1".
    chain = ""
    dotted = []
    for number in filter(None, prefix.split("/")):
        chain = number if not chain else chain + "." + number
        dotted.append(chain)
    out.append("/".join(dotted) + "/" + rest)
print(out)
Prints:
[
"1/sometext1",
"1/1.1/sometext2",
"1/1.1/1.1.1/sometext3",
"1/1.1/1.1.2/sometext4",
"1/1.2/sometext5",
"1/1.2/1.2.1/sometext6",
"1/1.2/1.2.2/sometext7",
"2/sometext8",
"3/sometext9",
]

This code below will, for every path component, generate a unique corresponding number for that specific value:
from collections import defaultdict
import random, string
class UniquePaths:
    """Rewrite '/'-separated paths so every directory level gets its own id.

    Paths that share a leading directory keep sharing the same generated id
    for it, while directories that merely have the same name in different
    places receive different ids. The last component of each path is kept
    unchanged.
    """

    def __init__(self):
        # Every identifier handed out so far; consulted to guarantee uniqueness.
        self.paths = []

    def new_path(self):
        """Return a string of 3 to 10 random digits that was not returned before."""
        while True:
            candidate = ''.join(
                random.choice(string.digits) for _ in range(random.randint(3, 10))
            )
            if candidate not in self.paths:
                self.paths.append(candidate)
                return candidate

    def build_results(self, d, new_p=None):
        """Yield the rewritten paths for the split paths in *d*.

        d     -- list of paths, each already split into its components
        new_p -- generated ids of the directories above this level
                 (``None`` means the top level)

        Single-component entries are emitted first, in input order; the
        remaining entries are grouped by their first component and each
        group is processed recursively under one freshly generated id.
        """
        prefix = [] if new_p is None else new_p
        groups = defaultdict(list)
        for parts in d:
            if len(parts) == 1:
                # Joining prefix and leaf together avoids a stray leading '/'
                # when a path has no directory part at all.
                yield '/'.join([*prefix, parts[0]])
            else:
                groups[parts[0]].append(parts[1:])
        for members in groups.values():
            yield from self.build_results(members, prefix + [self.new_path()])

    @classmethod
    def to_unique(cls, paths):
        """Return the rewritten form of every path string in *paths* as a list."""
        return list(cls().build_results([i.split('/') for i in paths]))


xo = ['1/sometext1', '1/1/sometext2', '1/1/1/sometext3', '1/1/2/sometext4', '1/2/sometext5', '1/2/1/sometext6', '1/2/2/sometext7', '2/sometext8', '3/sometext9']
new_paths = UniquePaths.to_unique(xo)
Output:
['987498/sometext1',
'987498/3886405008/sometext2',
'987498/3886405008/4380239/sometext3',
'987498/3886405008/0407507/sometext4',
'987498/984618899/sometext5',
'987498/984618899/89110/sometext6',
'987498/984618899/45767633/sometext7',
'50264/sometext8',
'768/sometext9']
The solution above does not base the unique value generation on the original component values themselves, thus removing any possibility of producing a non unique path component, and instead randomly generates strings of varying lengths.

Related

Merging overlapping string sequences in a list

I am trying to figure out how to merge overlapping strings in a list together, for example for
['aacc','accb','ccbe']
I would get
['aaccbe']
The following code works for the example above; however, it does not give me the desired result in the following case:
# Input fragments, already arranged so that overlapping strings sit next to each other.
s = ['TGT','GTT','TTC','TCC','CCC','CCT','CCT','CTG','TGA','GAA','AAG','AGC','GCG','CGT','TGC','GCT','CTC','TCT','CTT','TTT','TTT','TTC','TCA','CAT','ATG','TGG','GGA','GAT','ATC','TCT','CTA','TAT','ATG','TGA','GAT','ATT','TTC']
# Only the first and the last fragment are ever looked at; everything in between is ignored.
a = s[0]
b = s[-1]
# Keep the part of `a` that precedes the first occurrence of b's first letter, then append all of `b`.
# Here a == 'TGT' and b == 'TTC': a.index('T') is 0, so the kept prefix is empty and the result is just 'TTC'.
final_s = a[:a.index(b[0])]+b
print(final_s)
>>>TTC
My output is clearly not right, and I don't know why it doesn't work in this case. Note that I have already organized the list with the overlapping strings next to each other.

You can use a trie to store the running substrings and determine overlap more efficiently. When an overlap is possible (i.e. for an input string, there exists a node in the trie whose letter starts or ends the input string), a breadth-first search finds the largest possible overlap, and then the remaining part of the string is added to the trie:
from collections import deque
# Trie node: stores a single letter together with its parent link and child nodes.
class Node:
    def __init__(self, e, p = None):
        self.e = e    # the letter held by this node
        self.p = p    # parent node, or None
        self.c = []   # child nodes

    def add_s(self, s):
        """Hang the letters of *s* below this node as a single chain; return this node."""
        tail = self
        for letter in s:
            child = self.__class__(letter, tail)
            tail.c.append(child)
            tail = child
        return self
# Trie of Node objects used to merge overlapping strings; it only stores the list of top-level nodes.
# NOTE(review): leading indentation was lost in this listing, so the nesting of the statements
# below has to be restored before the class can run.
class Trie:
def __init__(self):
# top-level nodes; add_string appends a new chain here when nothing overlaps
self.c = []
def last_node(self, n):
# Follow the first child at every step until a node without children is reached, and return it.
return n if not n.c else self.last_node(n.c[0])
def get_s(self, c, ls):
# For an input string, find nodes in the trie whose letter the string starts or ends with.
# `c` is a list of nodes and `ls` the letters to look for; matching nodes are yielded and the
# search recurses into child lists.
for i in c:
if i.e in ls:
yield i
yield from self.get_s(i.c, ls)
def add_string(self, s):
# q: work queue seeded with two entries per node found for s's first/last letter,
#    (s, node, 0) and (s, node, -1). The trailing 0 / -1 is later used as an index into the
#    remaining string (w[0][w[-1]]), i.e. it selects the first or the last letter to compare.
# d: candidate results collected while the queue is processed.
q, d = deque([j for i in self.get_s(self.c, (s[0], s[-1])) for j in [(s, i, 0), (s, i, -1)]]), []
while q:
# Entries whose node slot is None are recorded as candidates; only the 4-tuples queued further
# down carry a parent link in that slot, so presumably this means "walked past a root" — confirm.
if (w:=q.popleft())[1] is None:
d.append((w[0] if not w[0] else w[0][1:], w[2], w[-1]))
# Otherwise continue only while text remains and the node's letter equals the selected letter.
elif w[0] and w[1].e == w[0][w[-1]]:
if not w[-1]:
# Direction 0: consume the first letter; a node without children ends the walk and becomes a candidate ...
if not w[1].c:
d.append((w[0][1:], w[1], w[-1]))
else:
# ... otherwise the walk continues into every child.
q.extend([(w[0][1:], i, 0) for i in w[1].c])
else:
# Direction -1: drop the last letter and move to the parent, remembering the current node.
q.append((w[0][:-1], w[1].p, w[1], -1))
# Rebind d to a dict keyed by the remaining text of each candidate; if it is empty,
# s is stored as a new top-level chain.
if not (d:={a:b for a, *b in d}):
self.c.append(Node(s[0]).add_s(s[1:]))
# Otherwise take the candidate with the shortest remaining text; nothing is added when that text is empty.
elif (m:=min(d, key=len)):
if not d[m][-1]:
# Direction 0: append the remaining letters below the matched node.
d[m][0].add_s(m)
else:
# Direction -1: build a new chain and make its last node the parent of the matched node.
# NOTE(review): Node(m[0]).add_s(m) passes m including m[0] again — confirm this is intended.
t = Node(m[0]).add_s(m)
d[m][0].p = self.last_node(t)
Putting it all together
t = Trie()
for i in ['aacc','accb','ccbe']:
t.add_string(i)
def overlaps(trie, c = ''):
    """Yield one string per root-to-leaf walk below *trie*.

    *c* holds the letters collected above this node. Each node contributes
    its ``e`` letter, and a node with an empty ``c`` child list ends a walk.
    """
    collected = c + trie.e
    if trie.c:
        for child in trie.c:
            yield from overlaps(child, collected)
    else:
        yield collected
r = [j for k in t.c for j in overlaps(k)]
Output:
['aaccbe']

Use difflib.find_longest_match to find the overlap and concatenate appropriately, then use reduce to apply it across the entire list.
import difflib
from functools import reduce
def overlap(s1, s2):
    """Merge *s2* onto the end of *s1*, collapsing their overlap.

    The overlap is the longest suffix of *s1* that is also a prefix of
    *s2*. When there is none, the two strings are simply concatenated.

    A longest-common-substring search (difflib's find_longest_match, as in
    https://stackoverflow.com/a/14128905/4001592) is not used here: the
    longest match need not sit at the end of *s1* / the start of *s2*, and
    in that case characters were silently dropped, e.g.
    'TGTTCCC' + 'CCT' came out as 'TGTTCCT' instead of 'TGTTCCCT'.
    """
    # Try the largest possible overlap first so the merge is as tight as possible.
    for size in range(min(len(s1), len(s2)), 0, -1):
        if s1.endswith(s2[:size]):
            return s1 + s2[size:]
    return s1 + s2
s = ['aacc','accb','ccbe']
result = reduce(overlap, s, "")
print(result)
Output
aaccbe

Filtering a list of tuple data

I have a list like below.
list1 = [
('Ram','Laxman','Bharat','Sita'),
('Ram','Ravan','Bharat','Sita'),
('Ram','Luv','Dashrat','Sita'),
('Dasrath','Kekei','Bharat','Ram'),
('Laxman','Bharat','Ram','Hanuman'),
('Hanuman','Sita','Kekei','Ravan'),
('Ram','Sita','Hanuman','Ravan')
]
I want to filter out tuples that have at least 3 values in common with another tuple: if 2 or more tuples share at least 3 values, only the first of them should stay in the list, along with the remaining tuples.
For example, filtering the above list should give the following result:
result = [
('Ram','Laxman','Bharat','Sita'),
('Ram','Luv','Dashrat','Sita'),
('Dasrath','Kekei','Bharat','Ram'),
('Hanuman','Sita','Kekei','Ravan')
]

from operator import add
from functools import reduce
def solve(xss):
    """Return the tuples of *xss* that are not near-duplicates of an earlier kept one.

    Tuples are visited in order. A tuple is kept only if it shares fewer
    than 3 values with every tuple kept so far, so of several tuples that
    have at least 3 values in common only the first survives. An empty
    input gives an empty list.
    """
    mems = []
    kept_values = []  # value sets of the kept tuples, parallel to mems
    for xs in xss:
        values = set(xs)
        # Compare against each kept tuple separately: sharing single values
        # with several different tuples must not count as a 3-value match.
        if all(len(values & other) < 3 for other in kept_values):
            mems.append(xs)
            kept_values.append(values)
    return mems

I think your question is interesting. I use some matrix operations to solve it; I think this will be much faster than an ordinary loop.
from collections import OrderedDict
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
class DuplicateFinder(object):
    """Group the rows of ``raw_list`` that share at least three values.

    The rows are encoded with ``multi_label_encoder.fit_transform``; the
    product of that matrix with its transpose is then thresholded at 3 to
    find matching row pairs (assumes the encoder returns a 0/1 indicator
    matrix with one column per distinct value — confirm against the
    encoder in use).
    """

    def __init__(self, raw_list):
        self.raw_list = raw_list
        self.multi_label_encoder = MultiLabelBinarizer()
        # representative row index -> indices of the rows folded into it
        self.final_results = OrderedDict()
        self.title_select = []    # row indices chosen as representatives
        self.content_select = []  # row indices folded into a representative
        self.not_select = []      # row indices that ended up in neither list

    @property
    def match_result(self):
        """Square matrix: encoded rows multiplied by their own transpose."""
        label_matrix = self.multi_label_encoder.fit_transform(self.raw_list)
        return np.dot(label_matrix, label_matrix.T)

    @property
    def raw_results(self):
        """Array of (i, j) index pairs whose ``match_result`` entry is >= 3."""
        return np.array(np.where(self.match_result >= 3)).T

    def solve(self):
        """Fill ``final_results``, ``title_select``, ``content_select`` and ``not_select``."""
        # tolist() yields plain ints, so the stored indices print and compare
        # as ordinary Python numbers.
        for first, second in self.raw_results.tolist():
            if first == second:
                continue  # the diagonal: a row compared with itself
            if first in self.content_select:
                continue  # already folded into an earlier representative
            if first not in self.final_results:
                self.final_results[first] = [second]
                self.title_select.append(first)
                self.content_select.append(second)
            elif second not in self.content_select + self.title_select:
                self.final_results[first].append(second)
                self.content_select.append(second)
        taken = set(self.title_select + self.content_select)
        self.not_select = list(set(range(self.match_result.shape[0])) - taken)

    def print_result(self):
        """Print the representative rows first, then the rows left in ``not_select``."""
        print(f"This is more than one matched: {self.final_results}")
        for key in self.final_results:
            print(self.raw_list[key])
        print(f"This is just one: {self.not_select}")
        for key in self.not_select:
            print(self.raw_list[key])
list1 = [
("Ram", "Laxman", "Bharat", "Sita"),
("Ram", "Ravan", "Bharat", "Sita"),
("Ram", "Luv", "Dashrat", "Sita"),
("Dasrath", "Kekei", "Bharat", "Ram"),
("Laxman", "Bharat", "Ram", "Hanuman"),
("Hanuman", "Sita", "Kekei", "Ravan"),
("Ram", "Sita", "Hanuman", "Ravan"),
]
solver = DuplicateFinder(list1)
solver.solve()
solver.print_result()
This is the result:
This is more than one matched: OrderedDict([(0, [1, 4]), (5, [6])])
('Ram', 'Laxman', 'Bharat', 'Sita')
('Hanuman', 'Sita', 'Kekei', 'Ravan')
This is just one: [2, 3]
('Ram', 'Luv', 'Dashrat', 'Sita')
('Dasrath', 'Kekei', 'Bharat', 'Ram')

Counting strings in lists and then filtering & matching, in python

I have a list of words, and with python3 I count the difference in letters between each combination of words (using a clever diff_summing algorithm from this site):
import itertools
def diff_letters(a,b):
    """Count the index positions of *a* at which *b* holds a different character."""
    return sum(letter != b[position] for position, letter in enumerate(a))
w = ['AAHS','AALS','DAHS','XYZA']
for x,y in itertools.combinations(w,2):
if diff_letters(x,y) == 1:
print(x,y)
This prints:
AAHS AALS
AAHS DAHS
My question: How can I count and record that strings 'DAHS' and 'AALS' have exactly one partner, and 'AAHS' has two partners? I'll be filtering for directional combinations where each target_string has exactly one near_matching_word, so my final data would (as a JSON) look like this:
[
{
"target_word": "DAHS",
"near_matching_word": "AAHS"
},
{
"target_word": "AALS",
"near_matching_word": "AAHS"
}
]
(noticing that AAHS doesn't appear as a target_word)
I have one version using functools.reduce
import itertools
import functools
import operator
def diff_letters(a,b):
    """Count the index positions of *a* at which *b* holds a different character."""
    mismatches = 0
    for position in range(len(a)):
        if a[position] != b[position]:
            mismatches += 1
    return mismatches


w = ['AAHS','AALS','DAHS','XYZA']
# Every unordered pair of words that differ in exactly one position.
pairs = [
    (first, second)
    for first, second in itertools.combinations(w, 2)
    if diff_letters(first, second) == 1
]
# Flatten the pairs into one tuple of words; a word appears once per partner.
full_list = functools.reduce(operator.add, pairs)
# A word that occurs exactly once has exactly one partner.
for x in full_list:
    if full_list.count(x) != 1:
        continue
    print (x)
which prints
AALS
DAHS
but then I would have to go back to my big list pairs to find the near_matching_word. Of course, in my final version, list pairs will be much larger, and the target_word could be either the 1st or 2nd item in the tuple (x,y).

The other answers keep all pairs even when more than one is found. Since they are not needed, that seems to waste memory. This answer only keeps at most one pair for each string.
import collections
import itertools
def diff_letters(a,b):
    """Count the index positions of *a* at which *b* holds a different character."""
    return len([position for position in range(len(a)) if a[position] != b[position]])


w = ['AAHS','AALS','DAHS','XYZA']

# Sentinel for words that have no partner recorded yet.
NOT_FOUND = object()

# Maps word -> partner. A value is in one of three states:
# - NOT_FOUND: the word has not been seen yet
# - a string:  the single accepted partner of the word
# - None:      the word has more than one accepted partner
pairs = collections.defaultdict(lambda: NOT_FOUND)

for x, y in itertools.combinations(w, 2):
    if diff_letters(x, y) != 1:
        continue
    # Record the match in both directions; a second match for the same word
    # downgrades its entry to None.
    for word, partner in ((x, y), (y, x)):
        pairs[word] = partner if pairs[word] is NOT_FOUND else None

# Drop the None entries and switch to a plain dict.
pairs = {word: partner for word, partner in pairs.items() if partner}

for x, y in pairs.items():
    print("Target = {}, Only near matching word = {}".format(x, y))
Output:
Target = AALS, Only near matching word = AAHS
Target = DAHS, Only near matching word = AAHS

You could use a dictionary instead of a list of pairs:
# word -> list of all words that differ from it in exactly one position
pairs = {}
for x, y in itertools.combinations(w, 2):
    if diff_letters(x, y) != 1:
        continue
    pairs.setdefault(x, []).append(y)
    pairs.setdefault(y, []).append(x)
# Keep only the words that have exactly one partner.
result = [
    {"target_word": word, "near_matching_word": partners[0]}
    for word, partners in pairs.items()
    if len(partners) == 1
]
print(result)
Output
[{'target_word': 'AALS', 'near_matching_word': 'AAHS'}, {'target_word': 'DAHS', 'near_matching_word': 'AAHS'}]
In the pairs dictionary the keys are the target_words and the values are the lists of their near_matching_words. A list comprehension then filters out the keys that have more than 1 near_matching_word.

import itertools
import functools
import operator
def diff_letters(a, b):
    """Count the index positions of *a* at which *b* holds a different character."""
    return sum(a[i] != b[i] for i in range(len(a)))


w = ['AAHS', 'AALS', 'DAHS', 'XYZA']
# Every unordered pair of words that differ in exactly one position.
pairs = [
    (x, y)
    for x, y in itertools.combinations(w, 2)
    if diff_letters(x, y) == 1
]
# Flat tuple of all words in the pairs; unlike reduce(operator.add, pairs)
# this also works when no pair was found.
full_list = tuple(itertools.chain.from_iterable(pairs))
# word -> its partners, filled in first-seen order so the output order is
# reproducible (iterating a set of strings is not).
partners = {}
for x, y in pairs:
    partners.setdefault(x, []).append(y)
    partners.setdefault(y, []).append(x)
# A word qualifies as a target when it has exactly one partner.
result = [
    {
        "target_word": word,
        "near_matching_word": near[0]
    }
    for word, near in partners.items()
    if len(near) == 1
]
print(result)
Outputs:
[{'target_word': 'DAHS', 'near_matching_word': 'AAHS'}, {'target_word': 'AALS', 'near_matching_word': 'AAHS'}]

Find values in list which differ from reference list by up to N characters

I have a list like the following:
Test = ['ASDFGH', 'QWERTYU', 'ZXCVB']
And a reference list like this:
Ref = ['ASDFGY', 'QWERTYI', 'ZXCAA']
I want to extract the values from Test if they are N or less characters different from any one of the items in Ref.
For example, if N = 1, only the first two elements of Test should be output. If N = 2, all three elements fit this criteria and should be returned.
It should be noted that I am only looking for values of the same character length (ASDFGY -> ASDFG should not match for N = 1), so I want something more efficient than Levenshtein distance.
I have over 1000 values in ref and a couple hundred million in Test so efficiency is key.

Using a generator expression with sum:
Test = ['ASDFGH', 'QWERTYU', 'ZXCVB']
Ref = ['ASDFGY', 'QWERTYI', 'ZXCAA']
from collections import Counter
def comparer(x, y, n):
    """Return True when *x* and *y* have equal length and differ in at most *n* positions."""
    if len(x) != len(y):
        return False
    return sum(i != j for i, j in zip(x, y)) <= n
# Keep every Test value that is within one substitution of at least one Ref
# value (pairing Test and Ref position by position would only compare each
# value with a single reference).
res = [a for a in Test if any(comparer(a, b, 1) for b in Ref)]
print(res)
['ASDFGH', 'QWERTYU']

Using difflib
Demo:
import difflib
N = 1
Test = ['ASDFGH', 'QWERTYU', 'ZXCVB']
Ref = ['ASDFGY', 'QWERTYI', 'ZXCAA']
result = []
for i,v in zip(Test, Ref):
c = 0
for j,s in enumerate(difflib.ndiff(i, v)):
if s.startswith("-"):
c += 1
if c <= N:
result.append( i )
print(result)
Output:
['ASDFGH', 'QWERTYU']

The newer regex module offers a "fuzzy" match possibility:
import regex as re
Test = ['ASDFGH', 'QWERTYU', 'ZXCVB']
Ref = ['ASDFGY', 'QWERTYI', 'ZXCAA', 'ASDFGI', 'ASDFGX']
for item in Test:
    # Fuzzy pattern: the literal item with at most three substitutions.
    # The item is escaped so that characters such as '.' or '(' in a value
    # are matched literally instead of being read as pattern syntax.
    rx = re.compile('(' + re.escape(item) + '){s<=3}')
    for r in Ref:
        if rx.search(r):
            print(rf'{item} is similar to {r}')
This yields
ASDFGH is similar to ASDFGY
ASDFGH is similar to ASDFGI
ASDFGH is similar to ASDFGX
QWERTYU is similar to QWERTYI
ZXCVB is similar to ZXCAA
You can control it via the {s<=3} part, which allows three or fewer substitutions.
To have pairs, you could write
# (origin, difference) for every Ref value that fuzzily matches a Test value.
# The one-element list in the middle compiles each pattern once per origin;
# the origin is escaped so it is always matched as a literal.
pairs = [(origin, difference)
         for origin in Test
         for rx in [re.compile(rf"({re.escape(origin)}){{s<=3}}")]
         for difference in Ref
         if rx.search(difference)]
Which would yield for
Test = ['ASDFGH', 'QWERTYU', 'ZXCVB']
Ref = ['ASDFGY', 'QWERTYI', 'ZXCAA', 'ASDFGI', 'ASDFGX']
the following output:
[('ASDFGH', 'ASDFGY'), ('ASDFGH', 'ASDFGI'),
('ASDFGH', 'ASDFGX'), ('QWERTYU', 'QWERTYI'),
('ZXCVB', 'ZXCAA')]

Simplifying Vigenere cipher program in Python

I have the program below, which is passed on to another function which simply prints out the original and encrypted messages. I want to know how I can simplify this program, specifically the "match = zip" and "change = (reduce(lambda" lines. If possible to do this without using lambda, how can I?
from itertools import cycle
alphabet = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]
# Encrypt `message` with `keyword`: each letter of the lower-cased message is shifted by the
# alphabet index of the keyword letter paired with it (the keyword repeats via cycle).
# NOTE(review): indentation was lost in this listing, and `reduce` is not imported in the
# snippet shown — on Python 3 it would have to come from functools.
def vigenereencrypt(message,keyword):
output = ""
# Pair every message letter with the keyword letter that lines up with it.
match = zip(message.lower(),cycle(keyword.lower()))
for i in match:
# i is a (message letter, keyword letter) pair: add their alphabet indices and wrap at 26.
change = (reduce(lambda x, y: alphabet.index(x) + alphabet.index(y), i)) % 26
output = output + alphabet[change]
return output.lower()

Two things:
You don't need a local variable match; just loop over the zip directly.
You can unpack the two letters x and y in the for loop header rather than using reduce; reduce is normally used for larger iterables, and since there are only 2 items in i, it adds unnecessary complexity.
ie, you can change your for loop definition to:
for x, y in zip(...):
and your definition of change to:
change = (alphabet.index(x) + alphabet.index(y)) % 26

Starting with what R Nar said:
# Step 1: the loop unpacks each (message letter, keyword letter) pair directly, so neither the
# `match` variable nor reduce is needed. The result is still built by string concatenation.
def vigenereencrypt(message,keyword):
output = ""
for x, y in zip(message.lower(), cycle(keyword.lower())):
# Sum of the two alphabet indices, wrapped at 26.
change = (alphabet.index(x) + alphabet.index(y)) % 26
output = output + alphabet[change]
return output.lower()
We can be more efficient by using a list and then joining it, instead of adding to a string, and also by noticing that the output is already lowercase:
# Step 2: collect the encrypted letters in a list and join them once at the end; the final
# .lower() is dropped because every entry of `alphabet` is already lower-case.
def vigenereencrypt(message,keyword):
output = []
for x, y in zip(message.lower(), cycle(keyword.lower())):
# Sum of the two alphabet indices, wrapped at 26.
change = (alphabet.index(x) + alphabet.index(y)) % 26
output.append(alphabet[change])
return "".join(output)
Then we can reduce the body of the loop to one line..
# Step 3: same as the previous version, with the index arithmetic folded into the append call.
def vigenereencrypt(message,keyword):
output = []
for x, y in zip(message.lower(), cycle(keyword.lower())):
# alphabet[(index of x + index of y) mod 26] is the encrypted letter.
output.append(alphabet[(alphabet.index(x) + alphabet.index(y)) % 26])
return "".join(output)
... so we can turn it into a generator expression (a comprehension in parentheses, which feeds join without building a list first):
# Step 4: the loop becomes a single expression. The parentheses make `output` a generator
# expression, which join consumes directly.
def vigenereencrypt(message,keyword):
output = (
# alphabet[(index of x + index of y) mod 26] for each (message letter, keyword letter) pair
alphabet[(alphabet.index(x) + alphabet.index(y)) % 26]
for x, y in zip(message.lower(), cycle(keyword.lower()))
)
return "".join(output)
I feel like there's something we could do with map(alphabet.index, ...) but I can't think of a way that's any better than the generator expression.

You could also do it with plain indexing instead of zip:
alphabet = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]
alphaSort = {k:n for n,k in enumerate(alphabet)}  # letter -> position in the alphabet
alphaDex = {n:k for n,k in enumerate(alphabet)}   # position in the alphabet -> letter
def vigenereencrypt(message,keyword):
    """Return *message* encrypted with *keyword*, as lower-case text.

    Both arguments are lower-cased first. Letter k of the message is shifted
    by the alphabet position of keyword letter ``k % len(keyword)``, which is
    the pairing zip(message, cycle(keyword)) would produce, done by indexing.

    Raises KeyError for a character that is not in ``alphabet`` and
    ZeroDivisionError for an empty keyword with a non-empty message.
    """
    # Lower-case once up front instead of once per character.
    message = message.lower()
    keyword = keyword.lower()
    return "".join(
        alphaDex[(alphaSort[letter] + alphaSort[keyword[k % len(keyword)]]) % len(alphabet)]
        for k, letter in enumerate(message)
    )

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Substitute list of paths with unique elements at every level (/) - python

Related

Merging overlapping string sequences in a list

Filtering a list of tuple data

Counting strings in lists and then filtering & matching, in python

Find values in list which differ from reference list by up to N characters

Simplifying Vigenere cipher program in Python

Categories

Resources