How to judge if a string contains a given substring (have gap) - python

e.g.
a = 'abc123def'
b = 'abcdef'
I want a function which can judge whether b in a.
contains(a,b)=True
P.S. A gap is also allowed in the representation of b, e.g.
b='abc_def'
but regular expressions are not allowed.

If what you want to do is to check whether b is a subsequence of a, you can write:
def contains(a, b):
    """Return True if ``b`` is a subsequence of ``a`` (gaps allowed).

    The elements of ``b`` must appear in ``a`` in the same order, but not
    necessarily next to each other.

    Args:
        a: The sequence that is searched.
        b: The sequence whose elements are looked for, in order.

    Returns:
        True when every element of ``b`` was matched, False otherwise.
        An empty ``b`` is always contained.
    """
    m = len(b)
    j = 0  # index of the next element of b that still has to be matched
    for item in a:
        if j == m:
            # Everything is matched already; no need to scan the rest of a.
            break
        if item == b[j]:
            j += 1
    return j == m

Try using list comprehension:
def contains(main_string, sub_string):
    """Return True if ``sub_string`` occurs in ``main_string`` with gaps allowed.

    Each character of ``sub_string`` must be found in ``main_string`` after the
    position where the previous character was found, so both order and
    multiplicity are respected.

    Args:
        main_string: The string that is searched.
        sub_string: The characters to look for, in order.

    Returns:
        True if ``sub_string`` is a subsequence of ``main_string``.
    """
    # A membership test on an iterator consumes it up to and including the
    # match, so every following test resumes right after the previous hit.
    remaining = iter(main_string)
    return all(char in remaining for char in sub_string)
NOTE: 'all' is a builtin function which takes an iterable of booleans and returns True if all of them are True.

def new_contained(a, b):
    """Return True if ``b`` is contained in ``a`` when gaps are allowed.

    Args:
        a: The iterable that is searched (e.g. ``'abc123def'``).
        b: The iterable whose elements must show up in ``a`` in order
            (e.g. ``'abcdef'``).

    Returns:
        True if every element of ``b`` is found in ``a`` in order, else False.
    """
    remaining = iter(a)
    for wanted in b:
        # Advance through a until the wanted element shows up.
        for candidate in remaining:
            if candidate == wanted:
                break
        else:
            # a ran out before the wanted element was found.
            return False
    return True

Related

How to tell if one generator is a subsequence or a prefix of another generator?

I have two generators, A and B, of unknown length.
I want to know if B is a subsequence (contiguous) of A, so I do the following:
# First attempt from the question: skip ahead in A to the first element of B,
# then check that the rest of B is a prefix of the rest of A.
# Note: next(B) raises StopIteration when B is empty.
def subseq(A, B):
b0 = next(B)
for a in A:
if a == b0:
break
else: # no-break
# b0 not found in A so B is definitely not a subseq of A
return False
# is the remainder of B a prefix of the remainder of A?
return prefix(A, B)
# zip stops at the shorter input, so this also returns True when A runs out
# before B does.
def prefix(A, B):
return all(a == b for a, b in zip(A, B))
However, prefix(A, B) is not exactly correct, as if what remains of A is shorter than what remains of B, then I might get a false positive:
E.g. with A = 'abc' and B = 'abcd' (imagine they are generators), then return all(a == b for a, b in zip(A, B)) would return True.
But if I use zip_longest instead, then I have the complementary problem -- I would get false negatives:
E.g. with A = 'abcd' and B = 'abc', then return all(a == b for a, b in zip_longest(A, B)) would return False.
What's a sensible way to do this? Specifically, I want to zip to the length of the second argument. I basically want something like zip_(A, B, ziplengthselect=1)
where ziplengthselect=i tells the function that it should zip to the length of the ith argument.
Then the expression all(a == b for a, b in zip_(A, B, fillvalue=sentinel, ziplengthselect=1)) where sentinel is something not found in B, would have the following behavior. If the expression
reaches end of B, then it would evaluate to True
reaches end of A, then it would use the fillvalue, check sentinel == b, fail the check since sentinel was chosen to be something not found in B, and return False
fails an a == b check, then it would evaluate to False
I can think of solutions with try, except blocks, but was wondering if there's a better way.
# Whether generator B is a prefix of generator A.
def prefix(A, B):
    """Return True if the items of ``B`` are a prefix of the items of ``A``.

    Both arguments are consumed: ``B`` completely on success, and ``A`` by as
    many items as were compared.

    Args:
        A: Iterable that should start with the items of ``B``.
        B: Iterable holding the expected leading items.

    Returns:
        False as soon as an item differs or ``A`` ends before ``B``; True when
        ``B`` is exhausted without a mismatch.
    """
    A = iter(A)
    exhausted = object()  # unique marker, cannot collide with real items
    for b in B:
        a = next(A, exhausted)
        if a is exhausted or a != b:
            return False
    return True
OR
# Whether generator B is a prefix of generator A.
def prefix(A, B):
    """Return True if the items of ``B`` are a prefix of the items of ``A``.

    Args:
        A: Iterable that should start with the items of ``B``.
        B: Iterable holding the expected leading items.

    Returns:
        True only if every compared pair is equal and ``B`` has no items left
        over, i.e. ``A`` did not end before ``B``.
    """
    # The same iterator must be shared by zip and the leftover check below.
    B = iter(B)
    if not all(a == b for a, b in zip(A, B)):
        return False
    # zip asks A first: if A ran out, B was not touched and still holds its
    # unmatched items. If B ran out, nothing is left in it.
    exhausted = object()
    return next(B, exhausted) is exhausted
The above code works when A has no duplicates. However if A has duplicates, then we have to tee the generators as follows:
from itertools import tee
def subseq(A, B):
    """Return True if the items of ``B`` occur contiguously within ``A``.

    Every occurrence of the first item of ``B`` in ``A`` is tried as a start
    position. The attempt runs on ``tee`` copies so that a failed attempt
    does not consume items needed by later attempts.

    Args:
        A: Iterable that is searched; it is consumed while searching.
        B: Iterable holding the run to look for; assumed to be finite.

    Returns:
        True if ``B`` is empty or a matching run is found, otherwise False.
    """
    A, B = iter(A), iter(B)
    try:
        b0 = next(B)
    except StopIteration:
        # An empty B is contained in anything.
        return True
    # A is rebound by tee() below, so it has to be advanced with next()
    # rather than a for loop, which would keep using the old iterator.
    while True:
        try:
            a = next(A)
        except StopIteration:
            return False
        if a != b0:
            continue
        A, a_probe = tee(A)
        B, b_probe = tee(B)
        if prefix(a_probe, b_probe):
            return True
        # Drop the probes so tee does not keep buffering items for them.
        del a_probe, b_probe


def prefix(A, B):
    """Return True if the items of ``B`` are a prefix of the items of ``A``.

    Returns False on the first mismatch or when ``A`` ends before ``B``.
    """
    exhausted = object()  # unique marker, cannot collide with real items
    for b in B:
        a = next(A, exhausted)
        if a is exhausted or a != b:
            return False
    return True
# Some tests
# B is a run from the middle of A.
A = (i for i in range(10))
B = (i for i in range(5,8))
print(subseq(A, B)) # True
# B runs past the end of A.
A = (i for i in range(10))
B = (i for i in range(5,11))
print(subseq(A, B)) # False
# A repeats the pattern many times.
A = (i for i in [1,2,3]*10 + [1,2,3,4])
B = (i for i in [1,2,3])
print(subseq(A, B)) # True
# A contains duplicates; only the last start position leads to a full match.
A = (i for i in [1,1,2,1,1,2]*8 + [3])
B = (i for i in [1,1,2,3])
print(subseq(A, B)) # True
Here's how I solved the analogous subsequence problem for lists. Lists are easier because you can know their length:
def isSublist(lst, sublst):
    """Return True if ``sublst`` occurs as a contiguous run inside ``lst``.

    Args:
        lst: The sequence that is searched.
        sublst: The run to look for.

    Returns:
        True if ``sublst`` is empty or some slice of ``lst`` equals it
        element by element, otherwise False.
    """
    N, M = len(lst), len(sublst)
    if M == 0:
        # The empty run is contained in every sequence.
        return True
    first = sublst[0]
    # Only start positions that leave room for M elements are candidates.
    for start in range(N - M + 1):
        if lst[start] != first:
            continue
        if all(lst[start + k] == sublst[k] for k in range(1, M)):
            return True
    return False
I might use deques (although this assumes B is finite):
from collections import deque
from itertools import islice
def subseq(A, B):
    """Return True if the items of ``B`` occur contiguously within ``A``.

    A sliding window of ``len(B)`` items is moved over ``A`` and compared with
    ``B`` after every step.

    Args:
        A: Iterable that is searched; it is consumed while searching.
        B: Finite iterable holding the run to look for.

    Returns:
        True if ``B`` is empty or some window of ``A`` equals it.
    """
    # Without this a list would be re-read from the start by the for loop,
    # after islice had already copied its first items into the window.
    A = iter(A)
    B = deque(B)
    if not B:
        return True
    n = len(B)
    # Pre-fill with n-1 items; maxlen=n makes each append drop the oldest one.
    window = deque(islice(A, n - 1), n)
    for a in A:
        window.append(a)
        if window == B:
            return True
    return False
Might take more or less time/memory than yours. Depends on the input.
Try it online!
A note about yours: For an input like A = iter('a'+'b'*10**7), B = iter('ac') you waste a lot of memory (90 MB on 64-bit Python), since your Acop from the very beginning causes the underlying tee storage to never let go of anything. You'd better do del Acop, Bcop after an unsuccessful prefix check.
It’s possible to build KMP’s partial match table lazily.
from itertools import islice
def has_substring(sup, sub):
    """Return True if the items of ``sub`` occur contiguously within ``sup``.

    Knuth-Morris-Pratt search in which both the pattern and its partial match
    table are read lazily, so only as much of ``sub`` is consumed as the
    search needs.

    Args:
        sup: Iterable that is searched.
        sub: Iterable holding the pattern.

    Returns:
        True if ``sub`` is empty or a full match is found, otherwise False
        once ``sup`` is exhausted.
    """
    sub = LazySequence(sub)
    if not sub:
        return True
    t = kmp_table(sub)
    k = 0  # number of pattern items currently matched
    for x in sup:
        # On a mismatch fall back through the table until x fits or no
        # shorter partial match is left (-1).
        while x != sub[k]:
            k = t[k]
            if k == -1:
                break
        if k == -1:
            k = 0
            continue
        k += 1
        try:
            sub[k]
        except IndexError:
            # There is no item k: the whole pattern has been matched.
            return True
    return False


class LazySequence:
    """Indexable, re-iterable view of an iterator that caches what it reads."""

    def __init__(self, iterator):
        # Items pulled from the underlying iterator so far.
        self.consumed = []
        self.iterator = None if iterator is None else iter(iterator)

    def __getitem__(self, index):
        """Return item ``index``, reading more from the iterator if needed.

        Raises:
            IndexError: If the iterator ends before ``index`` is reached.
        """
        missing = index - len(self.consumed) + 1
        if missing > 0:
            # Append one item at a time: the producing generator may read
            # self.consumed while it is being filled (see kmp_table).
            for item in islice(self.iterator, missing):
                self.consumed.append(item)
        return self.consumed[index]

    def __iter__(self):
        """Yield the cached items first, then continue with the iterator."""
        consumed = self.consumed
        yield from consumed
        for x in self.iterator:
            consumed.append(x)
            yield x

    def __bool__(self):
        """Return True if the sequence has at least one item."""
        for _ in self:
            return True
        return False


def lazy_sequence(g):
    """Decorate generator function ``g`` so calls return a LazySequence.

    ``g`` receives the list of values it has produced so far as an extra
    first argument, which lets it refer back to its own earlier output.
    """
    def wrap_generator(*args, **kwargs):
        ls = LazySequence(None)
        ls.iterator = g(ls.consumed, *args, **kwargs)
        return ls
    return wrap_generator


@lazy_sequence
def kmp_table(t, w):
    """Yield the KMP partial match table for pattern ``w``, entry by entry.

    Args:
        t: Table entries yielded so far (supplied by ``lazy_sequence``).
        w: The pattern, as a LazySequence.
    """
    yield -1
    cnd = 0  # position in w of the current candidate for the next match
    for x in islice(w, 1, None):
        if x == w[cnd]:
            yield t[cnd]
        else:
            yield cnd
            while cnd != -1 and x != w[cnd]:
                cnd = t[cnd]
        cnd += 1
This search is fast (asymptotically optimal time of O(|sub| + |sup|)) and doesn’t use unnecessary time/space when one generator is much longer than the other – including being able to return True when sup is infinite and being able to return False when sub is infinite.

Comparing elements in two lists in python

I have a function that compares the elements of two lists and returns the difference between them. I have two versions of it. The first one works but not the second one. What is wrong with the second function? The inputs a and b are two lists of same length.
def compareLists(a, b):
    """Count the positions at which each list has the larger element.

    Args:
        a: First list.
        b: Second list; elements are paired with those of ``a`` by position.

    Returns:
        A tuple ``(A, B)``: ``A`` is the number of positions where the element
        of ``a`` is greater, ``B`` the number where the element of ``b`` is
        greater. Ties count for neither.
    """
    # Comparisons yield booleans, which sum as 1 and 0.
    A = sum(i > j for i, j in zip(a, b))
    B = sum(j > i for i, j in zip(a, b))
    return (A, B)
# Second attempt from the question, kept as asked. An `else` after the
# comprehension's filtering `if` is not valid syntax, so this version is
# rejected by the parser.
def compareLists(a, b):
A = sum([1 for i in range(0, len(a)) if a[i] > b[i] else 0])
B = sum([1 for i in range(0, len(a)) if b[i] > a[i] else 0])
return (A, B)
Eg input and output: a = [1, 2, 3,4]; b = [0, -2, 5, 6]; output = (2, 2)
You don't need the ternary operator (if-else) in the second code since using the if expression in a list comprehension is how the output can be filtered:
A = sum([1 for i in range(0, len(a)) if a[i] > b[i]])
B = sum([1 for i in range(0, len(a)) if b[i] > a[i]])
Adding else as you do in your second code makes the syntax invalid.
For completeness, as @wim noted in the comments, the ternary operator is unnecessary in your first code as well, because Boolean values in Python are simply the integers 1 and 0, so you can sum the Boolean values returned by the comparison operators directly instead:
A = sum([i > j for i, j in zip(a, b)])
B = sum([j > i for i, j in zip(a, b)])

Finding missing elements in a List

Hello I have a List with a lot of element in it. These are numbers and ordered but some numbers are missing.
Example: L =[1,2,3,4,6,7,10]
Missing: M = [5,8,9]
How can I find missing numbers in Python?
Take the difference between the sets:
set(range(min(L),max(L))) - set(L)
If you are really crunched for time and L is truly sorted, then
set(range(L[0], L[-1])) - set(L)
This function should do the trick
def missing_elements(L):
    """Return the integers missing from the sorted list ``L``.

    Args:
        L: List of integers in ascending order.

    Returns:
        A sorted list of the integers between the first and last element of
        ``L`` that do not appear in ``L``. An empty ``L`` yields ``[]``.
    """
    if not L:
        return []
    s, e = L[0], L[-1]
    return sorted(set(range(s, e + 1)).difference(L))
miss = missing_elements(L)
Here you are:
L =[1,2,3,4,6,7,10]
M = [i for i in range(1, max(L)) if i not in L]
# If 0 shall be included replace range(1, max(L)) to range(max(L))
With a comprehension it would look like this:
L = [1,2,3,4,6,7,10]
M = [i for i in range(min(L), max(L)+1) if i not in L]
M
#[5,8,9]
And a fun one, just to add to the bunch:
[i for a, b in zip(L, L[1:]) for i in range(a + 1, b) if b - a > 1]
L =[1,2,3,4,6,7,10]
R = range(1, max(L) + 1)
> [1,2,3,4,5,6,7,8,9,10]
M = list(set(R) - set(L))
> [5,8,9]
Note that M will not necessarily be ordered, but can easily be sorted.

Subtraction between two nested lists of strings in Python

I am trying to follow the used structure in this question for nested lists but I'm confused and don't know how to figure it out. Suppose that to subtract the two lists a = ['5', '35.1', 'FFD'] and b = ['8.5', '11.3', 'AMM'], the following code is used for reaching to equation c = b - a:
diffs = []
for i, j in zip(a, b):
try:
diffs.append(str(float(j) - float(i)))
except ValueError:
diffs.append('-'.join([j, i]))
>>> print(diffs)
['3.5', '-23.8', 'AMM-FFD']
My question is, how do I get C = B - A by considering the following structure:
A = [['X1','X2'],['52.3','119.4'],['45.1','111']]
B = [['Y1','Y2'],['66.9','65'],['99','115.5']]
C = [['Y1-X1','Y2-X2'],['14.6','-54.4'],['53.9','4.5']]
and how do I get the first and second elements of each internal list, something like:
Array 1 = ['Y1-X1', '14.6', '53.9']
Array 2 = ['Y2-X2', '-54.4', '4.5']
I appreciate any kind of help.
Well, if it's guaranteed that the lists will always be 2 levels nested, you can simply add one more loop:
# Element-wise "b - a" for two lists nested exactly two levels deep.
# `a` and `b` are the nested input lists.
diffs_lists = []
for row_a, row_b in zip(a, b):
    diffs = []
    for item_a, item_b in zip(row_a, row_b):
        try:
            # Subtract a from b, matching the flat version in the question.
            diffs.append(str(float(item_b) - float(item_a)))
        except ValueError:
            # Non-numeric labels are joined as "b-a" instead.
            diffs.append('-'.join([item_b, item_a]))
    diffs_lists.append(diffs)
To separate the result in two as you asked, simply use zip:
zip(*diffs_lists)
You just need another level of looping:
res = []
for a, b in zip(A, B):
diffs = []
res.append(diffs)
for i, j in zip(a, b):
try:
diffs.append(str(float(j) - float(i)))
except ValueError:
diffs.append('-'.join([j, i]))
print(res)
#[['Y1-X1', 'Y2-X2'], ['14.600000000000009', '-54.400000000000006'], ['53.9', '4.5']]
print(list(zip(*res)))
#[('Y1-X1', '14.600000000000009', '53.9'), ('Y2-X2', '-54.400000000000006', '4.5')]
diffs=[]
for sub_b, sub_a in zip(b, a):
curr = []
for atom_b, atom_a in zip(sub_b, sub_a):
try:
curr.append(float(atom_b) - float(atom_a))
except ValueError:
curr.append('-'.join([atom_b, atom_a]))
diffs.append(curr)
ans1, ans2 = zip(*diffs)
The zip function can also be used to unzip iterables.
Suppose you have a list_diffs function, that is basically the code you provided:
def list_diffs(a, b):
    """Return the element-wise difference ``b - a`` of two lists of strings.

    Args:
        a: List of strings to subtract.
        b: List of strings to subtract from.

    Returns:
        A list of strings: ``str(float(j) - float(i))`` for numeric pairs, and
        ``"j-i"`` for pairs where a value cannot be converted to float.
    """
    diffs = []
    for i, j in zip(a, b):
        try:
            diffs.append(str(float(j) - float(i)))
        except ValueError:
            diffs.append('-'.join([j, i]))
    return diffs
Then, the C you want is just a list whose elements are diffs between elements of A and elements of B. So the following gives you C:
C = []
for i in range(len(A)):
C.append(list_diffs(A[i], B[i]))
To get the lists of the first and of the second elements:
array1 = [c[0] for c in C]
array2 = [c[1] for c in C]
In case you need this to work with arbitrary amount of nesting you could use recursion:
def subtract(x, y):
    """Return ``y - x`` element by element for arbitrarily nested lists.

    Args:
        x: Possibly nested list of strings to subtract.
        y: List of the same structure to subtract from.

    Returns:
        A list mirroring the input structure. Numeric pairs become
        ``str(float(b) - float(a))``; pairs that cannot be converted to float
        become ``"b-a"``.
    """
    diffs = []
    for a, b in zip(x, y):
        if isinstance(a, list):
            # Nested level: recurse and keep the nesting in the result.
            diffs.append(subtract(a, b))
            continue
        try:
            diffs.append(str(float(b) - float(a)))
        except ValueError:
            diffs.append('-'.join([b, a]))
    return diffs
As others have pointed out zip can be used for unzipping:
res = subtract(A, B)
t1, t2 = zip(*res)
print(t1)
print(t2)
Output:
('Y1-X1', '14.6', '53.9')
('Y2-X2', '-54.4', '4.5')
i try it with a recursive method
A = [['X1','X2'],['52.3','119.4'],['45.1','111']]
B = [['Y1','Y2'],['66.9','65'],['99','115.5']]
C = [['Y1-X1','Y2-X2'],['14.6','-54.4'],['53.9','4.5']]
# Module-level accumulators filled by diff(); they are never cleared, so
# results from repeated calls pile up in the same two lists.
Array_a, Array_b = [[] for __ in range(2)]


def diff(B, A):
    """Collect ``B - A`` into ``Array_a`` and ``Array_b``.

    Within each list that is walked, the first non-list pair goes to
    ``Array_a`` and every later pair to ``Array_b``. Nested lists are handled
    recursively.

    Args:
        B: Possibly nested list of values to subtract from.
        A: List of the same structure holding the values to subtract.

    Returns:
        The tuple ``(Array_a, Array_b)`` of the module-level lists.
    """
    first_done = False
    for b, a in zip(B, A):
        if isinstance(b, list):
            diff(b, a)
            continue
        try:
            value = float(b) - float(a)
        except (ValueError, TypeError):
            # Not numeric: keep a textual "b-a" label instead.
            value = "{0}-{1}".format(b, a)
        target = Array_b if first_done else Array_a
        target.append(value)
        first_done = True
    return (Array_a, Array_b)
print (diff(B,A))
>>>(['Y1-X1', 14.600000000000009, 53.9], ['Y2-X2', -54.400000000000006, 4.5])

Can someone explain how this if statement is working?

Can someone explain how the if statement is working or the meaning of the code in the following code?
I have two lists, A and B, and I need to see if there exists a pair of elements, one from A the other from B, such that swapping them will make the sum of both lists equal.
My method, O(n^2) is to find the sumOfA and sumOfB.
Find the halfdiff = (sumOfA - sumOfB)/2
For each element in A, see if there's a B[i] so that (A[j] - B[i]) = halfdiff.
But the following code does it in O(n+m). And I don't understand the meaning of "if" statement (LINE 11) here. Does it guarantee that if it is true we have the required pair?
1 def fast_solution(A, B, m):
2 n = len(A)
3 sum_a = sum(A)
4 sum_b = sum(B)
5 d = sum_b - sum_a
6 if d % 2 == 1:
7 return False
8 d //= 2
9 count = counting(A, m)
10 for i in xrange(n):
11 if 0 <= B[i] - d and B[i] - d <= m and count[B[i] - d] > 0:
12 return True
13 return False
You have to find i, j such that sum(A) - a[i] + b[j] = sum(B) - b[j] + a[i], or equivalently, sum(A) - 2*a[i] = sum(B) - 2*b[j].
You can do this by calculating all possible results of the right-hand-side, and then searching through possible i values.
def exists_swap(A, B):
    """Return True if swapping one element of A with one of B equalises the sums.

    A swap of ``ai`` and ``bj`` works exactly when
    ``sum(A) - 2*ai == sum(B) - 2*bj``.

    Args:
        A: First list of numbers.
        B: Second list of numbers.

    Returns:
        True if such a pair ``(ai, bj)`` exists, otherwise False.
    """
    sumA = sum(A)
    sumB = sum(B)
    # All values the right-hand side can take, for constant-time lookup.
    bVals = set(sumB - 2 * bj for bj in B)
    return any(sumA - 2 * ai in bVals for ai in A)
The partial code in your question is doing a similar thing, except d = (sum(B)-sum(A))/2 and count is itertools.Counter(A) (that is, it's a dict that maps any x to the number of times it appears in A). Then count[B[i] - d] > 0 is equivalent to there being a j such that B[i] - d = A[j], or B[i] - A[j] = (sum(B) - sum(A))/2.
It may be that instead of using sets or dicts, the value m is the maximum value allowed in A and B. Then counting could be defined like this:
def counting(xs, m):
    """Count how often each value ``0..m`` occurs in ``xs``.

    Args:
        xs: Iterable of integers used as list indices; values are expected to
            lie in ``0..m``.
        m: Largest value that can be counted.

    Returns:
        A list ``r`` of length ``m + 1`` where ``r[v]`` is the number of
        occurrences of ``v`` in ``xs``.
    """
    r = [0] * (m + 1)
    for x in xs:
        r[x] += 1
    return r
This is a simple but inefficient way to represent a set of integers, but it makes sense of the missing parts of your question and explains the bounds checking 0 <= B[i] - d and B[i] - d <= m which is unnecessary if you use a set or dict, but necessary if counting returns an array.
Actually, it's not O(n+m). Linear estimation is just amortized because of hashmap count usage. This knowledge may help you to understand that your code is an obfuscated version of
bool solve(A,B) {
sum_a = sum(A)
sum_b = sum(B)
sort(B)
for(val in A)
if( binary_search(B, val - (sum_b - sum_a)/2 ) )
return true
return false
}
As Paul pointed out, 0 <= B[i] - d and B[i] - d <= m is just a validation of count argument. BTW his solution is purely linear, well implemented and much simpler to understand.

Categories

Resources