fill in list in multiple steps - python

Let's assume you have a list with y positions (all 0 for the sake of this question). If y = 10:
[0,0,0,0,0,0,0,0,0,0]
You want to fill adjacent positions with 1s up to a given value x and append each result to an empty list. If x = 4:
[[1,1,1,1,0,0,0,0,0,0], [0,1,1,1,1,0,0,0,0,0], [0,0,1,1,1,1,0,0,0,0], ... , [0,0,0,0,0,0,1,1,1,1]]
I made that occur through this function:
def permutations(number=4, limit=10):
    perms = []
    if type(number) == int:
        a = -1
        b = a + number
        while b < limit:
            a += 1
            b = a + number
            start = [0 for x in range(limit)]
            for i in range(a, b):
                start[i] = 1
            perms.append(start)
This is fine, but if I want to do the same thing while passing a tuple instead of an integer, I'd like the output to be:
if number = (4,3):
[[1,1,1,1,0,1,1,1,0,0], [1,1,1,1,0,0,1,1,1,0], [1,1,1,1,0,0,0,1,1,1],
[0,1,1,1,1,0,1,1,1,0], [0,1,1,1,1,0,0,1,1,1],
[0,0,1,1,1,1,0,1,1,1]]
The 0 between the two groupings of 1's is necessary. The first value of the tuple corresponds to the number of 1's in the first grouping, and the second value corresponds to the number of 1's in the second grouping. Ideally this function would work with tuples that have more than two values.
This idea is a little challenging to get across so please let me know if you need any clarification.
Thank you for your help!

The simplest approach I can think of is to generate all possible combinations of 1 and 0, and filter out all of the ones that don't have the right grouping lengths.
import itertools
def permutations(tup, limit=10):
    for candidate in itertools.product([0,1], repeat=limit):
        segment_lengths = [len(list(b)) for a,b in itertools.groupby(candidate) if a == 1]
        if tup == tuple(segment_lengths):
            yield candidate

for seq in permutations((4, 3), 10):
    print seq
Result:
(0, 0, 1, 1, 1, 1, 0, 1, 1, 1)
(0, 1, 1, 1, 1, 0, 0, 1, 1, 1)
(0, 1, 1, 1, 1, 0, 1, 1, 1, 0)
(1, 1, 1, 1, 0, 0, 0, 1, 1, 1)
(1, 1, 1, 1, 0, 0, 1, 1, 1, 0)
(1, 1, 1, 1, 0, 1, 1, 1, 0, 0)
Note that this is very slow for large values of limit - it has to evaluate 2^limit candidate sequences. Not bad for limit = 10; only 1024 candidates need to be evaluated. But it quickly grows into the millions and beyond for larger limits.
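To get a feel for that growth, here is a small, hypothetical check (not part of the original answer) that reuses the generator above and compares the number of candidates examined against the number of results actually kept:
# Brute-force blow-up: 2**limit candidates are examined, only a handful survive the filter.
for limit in (10, 15, 20):
    kept = sum(1 for _ in permutations((4, 3), limit))
    print("limit=%d: %d candidates examined, %d results kept" % (limit, 2 ** limit, kept))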
Edit: Inspired by user2097159's excellent comment, here's an approach with better run time.
import itertools

def possible_sums(total, size):
    """Finds all non-negative integer sequences whose sum equals `total`, and who have `size` elements."""
    if total == 0:
        yield [0]*size
        return
    if size == 1:
        yield [total]
        return
    for i in range(total+1):
        left = [i]
        for right in possible_sums(total-i, size-1):
            yield left + right

def interleave(a, b):
    """
    combines two lists a and b in order like:
    [a[0], b[0], a[1], b[1]...]
    """
    result = []
    for pair in itertools.izip_longest(a, b):
        for item in pair:
            if item is not None:
                result.append(item)
    return result

def flatten(seq):
    """flattens a list of lists into a one dimensional list"""
    return [x for item in seq for x in item]

def permutations(tup, limit):
    one_segments = [[1]*size for size in tup]
    for i in range(len(tup)-1):
        one_segments[i].append(0)
    remaining_zeroes = limit - sum(tup) - len(tup) + 1
    assert remaining_zeroes >= 0, "not enough room to separate ranges!"
    for gap_sizes in possible_sums(remaining_zeroes, len(tup)+1):
        zero_segments = [[0]*size for size in gap_sizes]
        yield flatten(interleave(zero_segments, one_segments))

for seq in permutations((4, 3), 10):
    print seq

You can generate all the lists recursively.
F(tup, limit) =
    [1, 1, ..., 1, 0] combined with all solutions of F(tup[1:], limit - tup[0] - 1)
    [0, 1, 1, ..., 1, 0] combined with all solutions of F(tup[1:], limit - tup[0] - 2)
    ...
If tup is empty, return a list containing a single all-zero list.
If sum(tup) + len(tup) - 1 > limit, return an empty list since there is no solution;
e.g. permutations((4,3,2), 10) should return [].
Otherwise, enumerate how many prefix zeros there will be:
Generate a prefix list, [0, 0, ..., 0, 1, 1, ..., 1, 0], where the number of 1s is the value of the first item in the tuple. Append an additional 0 only if it is not the last item of the tuple.
Call the function recursively on the rest of the tuple to solve the similar sub-problem.
Combine the prefix list with each solution of the sub-problem.
Here is the code:
def permutations(tup, limit=100):
    if len(tup) <= 0:
        return [[0] * limit]
    minimum_len = sum(tup) + len(tup) - 1
    if minimum_len > limit:
        return []
    perms = []
    for prefix_zero in range(0, limit - minimum_len + 1):
        prefix = [0] * prefix_zero + [1] * tup[0]
        if len(tup) > 1:
            prefix += [0]
        suffix_list = permutations(tup[1:], limit - len(prefix))
        perms += [prefix + suffix for suffix in suffix_list]  # combine the solutions
    return perms

This solution creates all permutations of blocks of ones (a list defined by each entry in the tuple) with blocks of zeros (lists of length one) for the extra padding.
import itertools as it
spec = (1,2,3)
nBlocks = len(spec)
nZeros = 5
totalSize = sum(spec) + nZeros+1-nBlocks
blocks = [[1,]*s + [0,] for s in spec]
zeros = [[0,],]*(nZeros+1-nBlocks)
a = list(it.permutations(blocks + zeros, nZeros+1))
b = [list(it.chain.from_iterable(l))[:-1] for l in a]
for l in b:
    print l

Without using itertools.
My shot at this; it should be fairly quick, but it uses a recursive generator (Python recursion depth limit, here I come...).
# simple test case
seqs = (1, 2, 3)
length = 10
# '0' spots count
zeros = length - (sum(seqs))
# partitions count
partitions = len(seqs) + 1
# first and last partitions can have 0 zeros
# so use a flag when we call the function or check if it's the last partition
def generate_gaps(zeros_left, partition, first=False):
    """
    :param zeros_left: how many zeros we can still use
    :param partition: what partition is this
    :param first: is this the first gap
    :return: all possible gaps
    """
    for gap in range((0 if first or partition == 0 else 1), zeros_left + 1):
        if partition == 0:
            if (zeros_left - gap) == 0:
                yield [gap]
        else:
            for rest in generate_gaps(zeros_left - gap, partition - 1):
                yield [gap] + rest
for gaps in generate_gaps(zeros, partitions - 1, True):
    print "using gaps: " + str(gaps)
    # merge lists
    # zip gaps (0's) and sequences (1's) - all but last gap (added to result)
    gaps_seqs = zip(gaps, seqs)
    # expand everything... magic (could be done explicitly trivially).
    result = sum(map(lambda x: [0] * x[0] + [1] * x[1], gaps_seqs), [])
    # last gap (truncated from zip)
    result = result + [0] * gaps[-1]

A simple non-recursive generator solution without itertools:
def fill_sequence(sequence, size):
    n_slots = size - len(sequence)
    for start in xrange(n_slots + 1):
        yield [0]*start + sequence + [0]*(n_slots - start)

def all_placements(inner_sizes, outer_size):
    x, y = inner_sizes
    for margin in xrange(1, outer_size - sum(inner_sizes) + 1):
        sequence = [1]*x + [0]*margin + [1]*y
        for p in fill_sequence(sequence, outer_size):
            yield p
So that:
>>> list(all_placements((4,3), 10))
[[1, 1, 1, 1, 0, 1, 1, 1, 0, 0],
[0, 1, 1, 1, 1, 0, 1, 1, 1, 0],
[0, 0, 1, 1, 1, 1, 0, 1, 1, 1],
[1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
[0, 1, 1, 1, 1, 0, 0, 1, 1, 1],
[1, 1, 1, 1, 0, 0, 0, 1, 1, 1]]
The idea is quite simple. Suppose you fix the number of zeros between your two blocks of ones, call it the margin. This gives you a 4 + margin + 3 sequence. You can easily place this sequence in the larger list of zeros using the approach you took in your post. Then simply iteratively increase the margin, yielding all possible placements.


How do I set limits for my range function in python?

Say I am iterating through a list. I want to check whether the list's neighbours (i-1 and i+1) contain a certain element. How do I do this without running into a "list index out of range" problem?
Example:
list = [1, 0, 1, 0, 1, 0]
for i, j in enumerate(list):
    elements = 0
    for m in range(i-1,i+2):
        if list[m] == 1:
            elements += 1
    print(list[i], elements)
How do I set boundaries for the range function, so that it doesn't go below 0 and above len(list)?
If you want to iterate over all elements in the target list, one solution is to clamp the bounds of the second for loop:
_list = [1, 0, 1, 0, 1, 0]
elements = 0
for i, j in enumerate(_list):
    for m in range(max(i-1, 0), min(i+2, len(_list))):
        if _list[m] == 1:
            elements += 1
Try slicing the list from the top and bottom
list = [1, 0, 1, 0, 1, 0]
elements = 0
# slice list and start from second element and finish at the penultimate element
for i, j in enumerate(list[1:-1], 1):
    for m in range(i-1,i+2):
        if list[m] == 1:
            elements += 1
or, since you don't use the list items in the outer loop, loop over a range
elements = 0
# start from the second index and finish at the penultimate index
for i in range(1, len(list)-1):
    for m in range(i-1,i+2):
        if list[m] == 1:
            elements += 1
Sounds like you want to use a window function. I got this somewhere here and have been using it over the years:
from typing import Generator
from itertools import islice
def window(seq, n: int = 2) -> Generator:
    """
    Returns a sliding window (of width n) over data from the iterable
    """
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result

mylist = [1, 0, 1, 0, 1, 0, 5]
for chunk in window(mylist, n=3):
    print(chunk)
This will give you :
(1, 0, 1)
(0, 1, 0)
(1, 0, 1)
(0, 1, 0)
(1, 0, 5)
Where you can compare the contents of the resulting 'window' however you like.
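To tie this back to the original question (counting how many neighbours of each element are 1), one hedged way to use the window above is to pad the list at both ends so the edge elements also get a full 3-wide window; the None padding and the counting logic here are my own sketch, not part of the original answer.
mylist = [1, 0, 1, 0, 1, 0]
# Pad with None so the first and last elements also get a 3-wide window.
padded = [None] + mylist + [None]
for centre, chunk in zip(mylist, window(padded, n=3)):
    ones = sum(1 for x in (chunk[0], chunk[2]) if x == 1)  # left and right neighbours only
    print(centre, ones)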

Simple method to find 0,0,1 in that order, within a given List

I am trying to write a simple function to find if 0,0,1 occurs in a list, in that order.
It should return True or False.
The list can contain any number of numbers.
For the function ZeroZeroOne examples would be as follows:
>> ZeroZeroOne( [0,0,1] )
>> True
>> ZeroZeroOne( [1,0,0] )
>> False
# there are 2s in between but the following does have 0,0,1 occurring and in correct order
>> ZeroZeroOne( [0,2,2,2,2,0,1] )
>> True
I have this function:
def ZeroZeroOne(nums):
    FoundIt = False
    # quick return if defo not possible
    if (nums.count(0) < 2) and (nums.count(1) == 0):
        return FoundIt
    n = len(nums)
    for x in range(n-2):
        if nums[x] == 0:
            for i, z in enumerate(nums[(x+1):]):
                if z == 0 and z != 1:
                    for j, q in enumerate(nums[(i+1):]):
                        if q == 1 and q != 0:
                            FoundIt = True
    return FoundIt
Why does the function return True for this list [0, 1, 0, 2, 1]?
Moreover....
This function seems overly-complex for a seemingly simple problem.
Is there a correct approach to this problem in Python - a canonical or Pythonic approach?
Or is one's approach simply opinion-based?
You can trivially modify the ordered subsequence test from this answer for an elegant solution:
def ZeroZeroOne(arr):
    test = iter(a for a in arr if a in (0, 1))
    return all(z in test for z in (0, 0, 1))
I realize now that you don't want to accept sequences like 0, 1, 0, 1.
You can use itertools.tee to check for a match:
import itertools

def ZeroZeroOne(arr):
    e = itertools.tee((a for a in arr if a in (0, 1)), 3)
    # move second iterator forward one
    next(e[1])
    # move third iterator forward two
    next(e[2])
    next(e[2])
    return (0, 0, 1) in zip(*e)
The nice thing about using tee in this case is that it effectively maintains a rolling buffer of the last three elements for you. You don't need to make a new slice or loop over indices or anything like that.
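A quick sanity check of the tee-based version against the examples from the question (the expected values in the comments are what the definition above produces):
print(ZeroZeroOne([0, 0, 1]))              # True
print(ZeroZeroOne([1, 0, 0]))              # False
print(ZeroZeroOne([0, 2, 2, 2, 2, 0, 1]))  # True
print(ZeroZeroOne([0, 1, 0, 2, 1]))        # False - the case the original code got wrong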
Just for fun, here's a more general solution in pure python. It accepts any iterable for arr and template:
def contains_template(arr, template):
    template = tuple(template)
    unique = set(template)
    filtered = (a for a in arr if a in unique)
    e = itertools.tee(filtered, len(template))
    for n, it in enumerate(e):
        for _ in range(n):
            next(it)
    return template in zip(*e)
While itertools.tee is a nice way to maintain a rolling buffer, you can implement the same thing using a list (or more efficiently, collections.deque):
def contains_template(arr, template):
    template = list(template)
    unique = set(template)
    filtered = (a for a in arr if a in unique)
    buffer = [next(filtered) for _ in range(len(template) - 1)]
    buffer.insert(0, None)
    for e in filtered:
        buffer.pop(0)
        buffer.append(e)
        if template == buffer:
            return True
    return False
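For completeness, here is a sketch of the deque variant alluded to above (my own adaptation, not from the original answer, and the name contains_template_deque is mine); deque(maxlen=n) discards the oldest element automatically when a new one is appended.
from collections import deque

def contains_template_deque(arr, template):
    # Same rolling-buffer idea, but deque(maxlen=n) drops the oldest element for us.
    template = list(template)
    unique = set(template)
    buffer = deque(maxlen=len(template))
    for a in (x for x in arr if x in unique):
        buffer.append(a)
        if list(buffer) == template:
            return True
    return False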
Finally, here is the really simple solution, without a rolling buffer:
def contains_template(arr, template):
    template = list(template)
    n = len(template)
    unique = set(template)
    filtered = [a for a in arr if a in unique]
    return any(filtered[i:i + n] == template for i in range(len(filtered) - n + 1))  # + 1 so a match at the very end is found
You can also do it with a recursive function :
def check(seq, liste, i=0, j=0):
    if i >= len(seq):
        return True
    if j >= len(liste):
        return False
    if seq[i] == liste[j]:
        return check(seq, liste, i + 1, j + 1)
    elif liste[j] in seq:
        # look for the last index you can restart from
        for k in range(i - 1, -1, -1):
            if seq[k] == liste[j]:
                if seq[:k] == seq[i - k:i]:
                    ind = k
                    break
        else:
            ind = 0
        return check(seq, liste, ind, j + (not i))
    else:
        return check(seq, liste, i, j + 1)

# seq = [0,0,1] for ZeroZeroOne
print(check([0, 0, 1], [0, 0, 0, 0, 1]))  # True
print(check([0, 0, 1], [0, 200, 0, 0, 101, 1]))  # True
print(check([0, 2, 2, 0, 1], [0, 2, 0, 4, 2, 5, 2, 0, 3, 1]))  # True
print(check([0, 2, 2, 0, 1], [0, 2, 4, 2, 5, 2, 0, 3, 1]))  # False
You can achieve this with a single loop - O(n) time complexity. Since it is for this specific case. Try the code below.
def ZeroZeroOne(nums):
    found_pattern = []
    for num in nums:
        if num == 1:
            found_pattern.append(1)
            if len(found_pattern) == 3:
                return True
            else:
                found_pattern = []
        elif num == 0 and len(found_pattern) < 2:
            found_pattern.append(0)
    return False

print(ZeroZeroOne([0, 0, 1]))
print(ZeroZeroOne([0, 1, 0, 2, 1]))
print(ZeroZeroOne([0, 2, 0, 1]))
print(ZeroZeroOne([0, 0, 0, 1]))
print(ZeroZeroOne([0, 2, 2, 2, 2, 0, 1]))
But I think you can generalize this as well if required. Probably you need to look in to how grep works and modify it for your use case if you want a generic approach.
I think this does what you want :)
def ZeroZeroOne(arr):
    dropped = [x for x in arr if x==0 or x==1]
    slices = [dropped[i:i+3] for i in range(len(dropped)-2)]
    if [0,0,1] in slices: return True
    else: return False
def ZeroZeroOne(nums):
    filtered_nums = [x for x in nums if x in [0,1]]
    return '*'.join([str(x) for x in [0,0,1]]) in '*'.join([str(x) for x in filtered_nums])

Partition a number into a given set of numbers

Here is what I am trying to do. Given a number and a set of numbers, I want to partition that number into the numbers given in the set (with repetitions).
For example :
take the number 9, and the set of numbers = {1, 4, 9}.
It will yield the following partitions :
{ (1, 1, 1, 1, 1, 1, 1, 1, 1), (1, 1, 1, 1, 1, 4), (1, 4, 4), (9,)}
No other partitions using the set {1, 4, 9} can be formed that sum to the number 9.
I wrote a function in Python which does the task:
S = [ 1, 4, 9, 16 ]
def partition_nr_into_given_set_of_nrs(nr , S):
    lst = set()
    # Build the base case :
    M = [1]*(nr%S[0]) + [S[0]] * (nr //S[0])
    if set(M).difference(S) == 0 :
        lst.add(M)
    else :
        for x in S :
            for j in range(1, len(M)+1):
                for k in range(1, nr//x +1 ) :
                    if k*x == sum(M[:j]) :
                        lst.add( tuple(sorted([x]*k + M[j:])) )
    return lst
It works correctly, but I want to see some opinions about it. I'm not satisfied with the fact that it uses 3 loops and I guess it can be improved in a more elegant way. Maybe recursion is more suited in this case. Any suggestions or corrections would be appreciated. Thanks in advance.
I would solve this using a recursive function, starting with the largest number and recursively finding solutions for the remaining value, using smaller and smaller numbers.
def partition_nr_into_given_set_of_nrs(nr, S):
    nrs = sorted(S, reverse=True)
    def inner(n, i):
        if n == 0:
            yield []
        for k in range(i, len(nrs)):
            if nrs[k] <= n:
                for rest in inner(n - nrs[k], k):
                    yield [nrs[k]] + rest
    return list(inner(nr, 0))

S = [ 1, 4, 9, 16 ]
print(partition_nr_into_given_set_of_nrs(9, S))
# [[9], [4, 4, 1], [4, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]
Of course you could also do without the inner function by changing the parameters of the function and assuming that the list is already sorted in reverse order.
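A sketch of that variant (my own adaptation, with the hypothetical name partition_sorted, assuming the caller passes the numbers already sorted in descending order):
def partition_sorted(nr, nrs, i=0):
    # nrs must already be sorted in descending order.
    if nr == 0:
        yield []
    for k in range(i, len(nrs)):
        if nrs[k] <= nr:
            for rest in partition_sorted(nr - nrs[k], nrs, k):
                yield [nrs[k]] + rest

print(list(partition_sorted(9, sorted([1, 4, 9, 16], reverse=True))))
# [[9], [4, 4, 1], [4, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]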
If you want to limit the number of parts for large numbers, you can add an additional parameter indicating the remaining allowed number of elements and only yield a result if that number is still greater than zero.
def partition_nr_into_given_set_of_nrs(nr, S, m=10):
    nrs = sorted(S, reverse=True)
    def inner(n, i, m):
        if m > 0:
            if n == 0:
                yield []
            for k in range(i, len(nrs)):
                if nrs[k] <= n:
                    for rest in inner(n - nrs[k], k, m - 1):
                        yield [nrs[k]] + rest
    return list(inner(nr, 0, m))
Here is a solution using itertools; it has two for loops, so the time complexity is roughly O(n*n).
A small pre-filtering step reshapes the list by removing any element that is greater than the maximum sum needed.
Assuming you are taking the target sum to be the max of your set (9 in this case).
Source code
import itertools

x = [ 1, 4, 9, 16 ]
s = []
n = 9

# Remove elements > 9
x = [ i for i in x if i <= n]

for i in xrange(1, n + 1):
    for j in itertools.product(x, repeat = i):
        if sum(j) == n:
            s.append(list(j))

# Sort each combo
s = [sorted(i) for i in s]

# Group by unique combo
print list(k for k, _ in itertools.groupby(s))
Result
[[9], [1, 4, 4], [1, 1, 1, 1, 1, 4], [1, 1, 1, 1, 1, 1, 1, 1, 1]]
EDIT
You can further optimize for speed (if needed) by stopping the search for combos once the sum of a product exceeds the target,
e.g.
if sum(j) > n + 2:
    break

find the start position of the longest sequence of 1's

I want to find the start position of the longest sequence of 1's in my array:
a1=[0,0,1,1,1,1,0,0,1,1]
# expected start position: 2
I am following this answer to find the length of the longest sequence. However, I was not able to determine the position.
Inspired by this solution, here's a vectorized approach to solve it -
import numpy as np

# Get start, stop index pairs for islands/seq. of 1s
idx_pairs = np.where(np.diff(np.hstack(([False],a1==1,[False]))))[0].reshape(-1,2)

# Get the island lengths, whose argmax would give us the ID of the longest island.
# The start index of that island would be the desired output.
start_longest_seq = idx_pairs[np.diff(idx_pairs,axis=1).argmax(),0]
Sample run -
In [89]: a1 # Input array
Out[89]: array([0, 0, 1, 1, 1, 1, 0, 0, 1, 1])
In [90]: idx_pairs # Start, stop+1 index pairs
Out[90]:
array([[ 2,  6],
       [ 8, 10]])
In [91]: np.diff(idx_pairs,axis=1) # Island lengths
Out[91]:
array([[4],
       [2]])
In [92]: np.diff(idx_pairs,axis=1).argmax() # Longest island ID
Out[92]: 0
In [93]: idx_pairs[np.diff(idx_pairs,axis=1).argmax(),0] # Longest island start
Out[93]: 2
A more compact one-liner using groupby(). It uses enumerate() on the raw data to keep the starting positions through the analysis pipeline, eventually ending up with the list of tuples [(2, 4), (8, 2)], each tuple containing the starting position and length of the non-zero runs:
from itertools import groupby
L = [0,0,1,1,1,1,0,0,1,1]
print max(((lambda y: (y[0][0], len(y)))(list(g)) for k, g in groupby(enumerate(L), lambda x: x[1]) if k), key=lambda z: z[1])[0]
the lambda on x is the key function for groupby(), since we enumerated L
the lambda on y packages up the results we need, since we can only evaluate g once without saving it
the lambda on z is the key function for max() to pull out the lengths
Prints '2' as expected.
This seems to work, using groupby from itertools, this only goes through the list once:
from itertools import groupby
pos, max_len, cum_pos = 0, 0, 0
for k, g in groupby(a1):
    if k == 1:
        pat_size = len(list(g))
        pos, max_len = (pos, max_len) if pat_size < max_len else (cum_pos, pat_size)
        cum_pos += pat_size
    else:
        cum_pos += len(list(g))
pos
# 2
max_len
# 4
You could use a for loop and check whether the next few items (a slice of length m, where m is the maximum run length) form an unbroken run of 1's:
# Using your list and the answer from the post you referred to
from itertools import groupby
L = [0,0,1,1,1,1,0,0,1,1]
m = max(sum(1 for i in g) for k, g in groupby(L))

# Here is the for loop
for i, s in enumerate(L):
    if len(L) - i + 2 < len(L) - m:
        break
    if s == 1 and 0 not in L[i:i+m]:
        print i
        break
This will give:
2
Another way of doing it in a single loop, but without resorting to itertools' groupby.
max_start = 0
max_reps = 0
start = 0
reps = 0
for (pos, val) in enumerate(a1):
    start = pos if reps == 0 else start
    reps = reps + 1 if val == 1 else 0
    max_reps = max(reps, max_reps)
    max_start = start if reps == max_reps else max_start
This could also be done in a one-liner fashion using reduce:
max_start = reduce(lambda (max_start, max_reps, start, reps), (pos, val): (start if reps == max(reps, max_reps) else max_start, max(reps, max_reps), pos if reps == 0 else start, reps + 1 if val == 1 else 0), enumerate(a1), (0, 0, 0, 0))[0]
In Python 3, you cannot unpack tuples inside the lambda arguments definition, so it's preferable to define the function using def first:
from functools import reduce  # reduce is not a builtin in Python 3

def func(acc, x):
    max_start, max_reps, start, reps = acc
    pos, val = x
    return (start if reps == max(reps, max_reps) else max_start,
            max(reps, max_reps),
            pos if reps == 0 else start,
            reps + 1 if val == 1 else 0)

max_start = reduce(func, enumerate(a1), (0, 0, 0, 0))[0]
In any of the three cases, max_start gives your answer (i.e. 2).
Using more_itertools, a third-party library:
Given
import itertools as it
import more_itertools as mit
lst = [0, 0, 1, 1, 1, 1, 0, 0, 1, 1]
Code
longest_contiguous = max([tuple(g) for _, g in it.groupby(lst)], key=len)
longest_contiguous
# (1, 1, 1, 1)
pred = lambda w: w == longest_contiguous
next(mit.locate(mit.windowed(lst, len(longest_contiguous)), pred=pred))
# 2
See also the more_itertools.locate docstring for details on how these tools work.
For another solution that uses only Numpy, I think this should work in all the cases. The most upvoted solution is probably faster though.
import numpy as np

tmp = np.cumsum(np.insert(np.array(a1) != 1, 0, False))  # value of tmp[i+1] was not incremented when a1[i] is 1
# [0, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]
values, counts = np.unique(tmp, return_counts=True)
# [0, 1, 2, 3, 4], [1, 1, 5, 1, 3]
counts_idx = np.argmax(counts)
longest_sequence_length = counts[counts_idx] - 1
# 4
longest_sequence_idx = np.argmax(tmp == values[counts_idx])
# 2
I've implemented a run-searching function for numpy arrays in haggis.npy_util.mask2runs. You can use it like this:
from haggis.npy_util import mask2runs

runs, lengths = mask2runs(a1, return_lengths=True)
result = runs[lengths.argmax(), 0]

Detect whether sequence is a multiple of a subsequence in Python

I have a tuple of zeros and ones, for instance:
(1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1)
It turns out:
(1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1) == (1, 0, 1, 1) * 3
I want a function f such that if s is a non-empty tuple of zeros and ones, f(s) is the shortest subtuple r such that s == r * n for some positive integer n.
So for instance,
f( (1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1) ) == (1, 0, 1, 1)
What is a slick way to write the function f in Python?
Edit:
The naive method I am currently using
def f(s):
    for i in range(1,len(s)):
        if len(s)%i == 0 and s == s[:i] * (len(s)/i):
            return s[:i]
I believe I have an O(n) time solution (actually 2n+r, where n is the length of the tuple and r is the length of the repeating subtuple) which does not use suffix trees, but uses a string matching algorithm (like KMP, which you should find off the shelf).
We use the following little known theorem:
If x,y are strings over some alphabet,
then xy = yx if and only if x = z^k and y = z^l for some string z and integers k,l.
I now claim that, for the purposes of our problem, this means that all we need to do is determine if the given tuple/list (or string) is a cyclic shift of itself!
To determine if a string is a cyclic shift of itself, we concatenate it with itself (it does not even have to be a real concat, just a virtual one will do) and check for a substring match (with itself).
For a proof of that, suppose the string is a cyclic shift of itself.
Then we have that the given string y = uv = vu.
Since uv = vu, we must have that u = z^k and v= z^l and hence y = z^{k+l} from the above theorem. The other direction is easy to prove.
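As a tiny illustration of the doubling trick (my own example, not from the answer), the smallest non-zero offset at which the doubled string contains the original is the length of the repeating unit:
s = "101110111011"            # the question's tuple written as a string
print((s + s).index(s, 1))    # 4 - the length of the repeating unit "1011"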
Here is the python code. The method is called powercheck.
def powercheck(lst):
    count = 0
    position = 0
    for pos in KnuthMorrisPratt(double(lst), lst):
        count += 1
        position = pos
        if count == 2:
            break
    return lst[:position]

def double(lst):
    for i in range(1,3):
        for elem in lst:
            yield elem

def main():
    print powercheck([1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1])

if __name__ == "__main__":
    main()
And here is the KMP code which I used (due to David Eppstein).
# Knuth-Morris-Pratt string matching
# David Eppstein, UC Irvine, 1 Mar 2002

def KnuthMorrisPratt(text, pattern):
    '''Yields all starting positions of copies of the pattern in the text.
    Calling conventions are similar to string.find, but its arguments can be
    lists or iterators, not just strings, it returns all matches, not just
    the first one, and it does not need the whole text in memory at once.
    Whenever it yields, it will have read the text exactly up to and including
    the match that caused the yield.'''

    # allow indexing into pattern and protect against change during yield
    pattern = list(pattern)

    # build table of shift amounts
    shifts = [1] * (len(pattern) + 1)
    shift = 1
    for pos in range(len(pattern)):
        while shift <= pos and pattern[pos] != pattern[pos-shift]:
            shift += shifts[pos-shift]
        shifts[pos+1] = shift

    # do the actual search
    startPos = 0
    matchLen = 0
    for c in text:
        while matchLen == len(pattern) or \
              matchLen >= 0 and pattern[matchLen] != c:
            startPos += shifts[matchLen]
            matchLen -= shifts[matchLen]
        matchLen += 1
        if matchLen == len(pattern):
            yield startPos
For your sample this outputs
[1,0,1,1]
as expected.
I compared this against shx2's code (not the numpy one) by generating a random 50-bit string, then replicating it to make the total length 1 million. This was the output (the decimal numbers are the output of time.time()):
1362988461.75
(50, [1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1])
1362988465.96
50 [1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1]
1362988487.14
The above method took ~4 seconds, while shx2's method took ~21 seconds!
Here was the timing code. (shx2's method was called powercheck2).
import random
import time

def rand_bitstring(n):
    rand = random.SystemRandom()
    lst = []
    for j in range(1, n+1):
        r = rand.randint(1,2)
        if r == 2:
            lst.append(0)
        else:
            lst.append(1)
    return lst

def main():
    lst = rand_bitstring(50)*200000
    print time.time()
    print powercheck(lst)
    print time.time()
    powercheck2(lst)
    print time.time()
The following solution is O(N^2), but has the advantage of not creating any copies (or slices) of your data, as it is based on iterators.
Depending on the size of your input, the fact you avoid making copies of the data can result in a significant speed-up, but of course, it would not scale as well for huge inputs as algorithms with lower complexity (e.g. O(N*logN)).
[This is the second revision of my solution, the first one is given below. This one is simpler to understand, and is more along the lines of OP's tuple-multiplication, only using iterators.]
from itertools import izip, chain, tee

def iter_eq(seq1, seq2):
    """ assumes the sequences have the same len """
    return all( v1 == v2 for v1, v2 in izip(seq1, seq2) )

def dup_seq(seq, n):
    """ returns an iterator which is seq chained to itself n times """
    return chain(*tee(seq, n))

def is_reps(arr, slice_size):
    if len(arr) % slice_size != 0:
        return False
    num_slices = len(arr) / slice_size
    return iter_eq(arr, dup_seq(arr[:slice_size], num_slices))

s = (1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1)
for i in range(1,len(s)):
    if is_reps(s, i):
        print i, s[:i]
        break
[My original solution]
from itertools import islice

def is_reps(arr, num_slices):
    if len(arr) % num_slices != 0:
        return False
    slice_size = len(arr) / num_slices
    for i in xrange(num_slices):  # one check per residue class of the candidate period
        if len(set( islice(arr, i, None, num_slices) )) > 1:
            return False
    return True

s = (1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1)
for i in range(1,len(s)):
    if is_reps(s, i):
        print i, s[:i]
        break
You can avoid the call to set() by using something like:
def is_iter_unique(seq):
    """ a faster version of testing len(set(seq)) <= 1 """
    seen = set()
    for x in seq:
        seen.add(x)
        if len(seen) > 1:
            return False
    return True
and replacing this line:
if len(set( islice(arr, i, None, num_slices) )) > 1:
with:
if not is_iter_unique(islice(arr, i, None, num_slices)):
Simplifying Knoothe's solution. His algorithm is right, but his implementation is too complex. This implementation is also O(n).
Since your array is only composed of ones and zeros, what I do is use the existing str.find implementation (Boyer-Moore) to implement Knoothe's idea. It's surprisingly simpler and amazingly faster at runtime.
def f(s):
    s2 = ''.join(map(str, s))
    return s[:(s2+s2).index(s2, 1)]
Here's another solution (competing with my earlier iterators-based solution), leveraging numpy.
It does make a (single) copy of your data, but taking advantage of the fact your values are 0s and 1s, it is super-fast, thanks to numpy's magics.
import numpy as np

def is_reps(arr, slice_size):
    if len(arr) % slice_size != 0:
        return False
    arr = arr.reshape((-1, slice_size))
    return (arr.all(axis=0) | (~arr).all(axis=0)).all()

s = (1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1) * 1000
a = np.array(s, dtype=bool)
for i in range(1,len(s)):
    if is_reps(a, i):
        print i, s[:i]
        break
Just a different approach to the problem
I first determine all the factors of the length, then split the list and check if all the parts are the same.
>>> from itertools import izip
>>> def f(s):
...     def factors(n):
...         # http://stackoverflow.com/a/6800214/977038
...         return set(reduce(list.__add__,
...                     ([i, n//i] for i in range(2, int(n**0.5) + 1) if n % i == 0)))
...     _len = len(s)
...     for fact in reversed(list(factors(_len))):
...         compare_set = set(izip(*[iter(s)]*fact))
...         if len(compare_set) == 1:
...             return compare_set
...
>>> t = (1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1)
>>> f(t)
set([(1, 0, 1, 1)])
You can achieve it in sublinear time by XOR'ing the rotated binary form of the input array:
get the binary representation of the array, input_binary
loop from i = 1 to len(input_array)/2, and for each loop, rotate the input_binary to the right by i bits, save it as rotated_bin, then compare the XOR of rotated_bin and input_binary.
The first i that yields 0 is the length of the desired substring.
Complete code:
def get_substring(arr):
    binary = ''.join(map(str, arr))  # join the elements to get the binary form
    for i in xrange(1, len(arr) / 2 + 1):  # + 1 so an exact two-fold repetition is also caught
        # do an i-bit rotation shift, get bit string rotated_bin
        rotated_bin = binary[-i:] + binary[:-i]
        if int(rotated_bin) ^ int(binary) == 0:
            return arr[0:i]
    return None

if __name__ == "__main__":
    test = [1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1]
    print get_substring(test)  # [1,0,1,1]
This one is just a dumb recursive comparison in Haskell. It takes about one second for Knoothe's million long string (f a). Cool problem! I'll think about it some more.
a = concat $ replicate 20000
    [1,1,1,0,0,1,0,1,0,0,1,0,0,1,1,1,0,0,
     0,0,0,0,1,1,1,1,0,0,0,1,1,0,1,1,1,1,
     1,1,1,0,0,1,1,1,0,0,0,0,0,1]

f s =
    f' s [] where
        f' [] result = []
        f' (x:xs) result =
            let y = result ++ [x]
            in if concat (replicate (div (length s) (length y)) y) == s
                   then y
                   else f' xs y
