Given a list of data, I'm trying to create a new list in which the value at position i is the length of the longest run starting from position i in the original list. For instance, given
x_list = [1, 1, 2, 3, 3, 3]
Should return:
run_list = [2, 1, 1, 3, 2, 1]
My solution:
freq_list = []
current = x_list[0]
count = 0
for num in x_list:
if num == current:
count += 1
else:
freq_list.append((current,count))
current = num
count = 1
freq_list.append((current,count))
run_list = []
for i in freq_list:
z = i[1]
while z > 0:
run_list.append(z)
z -= 1
Firstly I create a list freq_list of tuples, where every tuple's first element is the element from x_list, and where the second element is the number of the total run.
In this case:
freq_list = [(1, 2), (2, 1), (3, 3)]
Having this, I create a new list and append appropriate values.
However, I was wondering if there is a shorter way/another way to do this?
Here's a simple solution that iterates over the list backwards and increments a counter each time a number is repeated:
last_num = None
result = []
for num in reversed(x_list):
if num != last_num:
# if the number changed, reset the counter to 1
counter = 1
last_num = num
else:
# if the number is the same, increment the counter
counter += 1
result.append(counter)
# reverse the result
result = list(reversed(result))
Result:
[2, 1, 1, 3, 2, 1]
This is possible using itertools:
from itertools import groupby, chain
x_list = [1, 1, 2, 3, 3, 3]
gen = (range(len(list(j)), 0, -1) for _, j in groupby(x_list))
res = list(chain.from_iterable(gen))
Result
[2, 1, 1, 3, 2, 1]
Explanation
First use itertools.groupby to group identical items in your list.
For each item in your groupby, create a range object which counts backwards from the length of the number of consecutive items to 1.
Turn this all into a generator to avoid building a list of lists.
Use itertools.chain to chain the ranges from the generator.
Performance note
Performance will be inferior to #Aran-Fey's solution. Although itertools.groupby is O(n), it makes heavy use of expensive __next__ calls. These do not scale as well as iteration in simple for loops. See itertools docs for groupby pseudo-code.
If performance is your main concern, stick with the for loop.
You are performing a reverse cumulative count on contiguous groups. We can create a Numpy cumulative count function with
import numpy as np
def cumcount(a):
a = np.asarray(a)
b = np.append(False, a[:-1] != a[1:])
c = b.cumsum()
r = np.arange(len(a))
return r - np.append(0, np.flatnonzero(b))[c] + 1
and then generate our result with
a = np.array(x_list)
cumcount(a[::-1])[::-1]
array([2, 1, 1, 3, 2, 1])
I would use a generator for this kind of task because it avoids building the resulting list incrementally and can be used lazily if one wanted:
def gen(iterable): # you have to think about a better name :-)
iterable = iter(iterable)
# Get the first element, in case that fails
# we can stop right now.
try:
last_seen = next(iterable)
except StopIteration:
return
count = 1
# Go through the remaining items
for item in iterable:
if item == last_seen:
count += 1
else:
# The consecutive run finished, return the
# desired values for the run and then reset
# counter and the new item for the next run.
yield from range(count, 0, -1)
count = 1
last_seen = item
# Return the result for the last run
yield from range(count, 0, -1)
This will also work if the input cannot be reversed (certain generators/iterators cannot be reversed):
>>> x_list = (i for i in range(10)) # it's a generator despite the variable name :-)
>>> ... arans solution ...
TypeError: 'generator' object is not reversible
>>> list(gen((i for i in range(10))))
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
And it works for your input:
>>> x_list = [1, 1, 2, 3, 3, 3]
>>> list(gen(x_list))
[2, 1, 1, 3, 2, 1]
This can actually be made simpler by using itertools.groupby:
import itertools
def gen(iterable):
for _, group in itertools.groupby(iterable):
length = sum(1 for _ in group) # or len(list(group))
yield from range(length, 0, -1)
>>> x_list = [1, 1, 2, 3, 3, 3]
>>> list(gen(x_list))
[2, 1, 1, 3, 2, 1]
I also did some benchmarks and according to these Aran-Feys solution is the fastest except for long lists where piRSquareds solution wins:
This was my benchmarking setup if you want to confirm the results:
from itertools import groupby, chain
import numpy as np
def gen1(iterable):
iterable = iter(iterable)
try:
last_seen = next(iterable)
except StopIteration:
return
count = 1
for item in iterable:
if item == last_seen:
count += 1
else:
yield from range(count, 0, -1)
count = 1
last_seen = item
yield from range(count, 0, -1)
def gen2(iterable):
for _, group in groupby(iterable):
length = sum(1 for _ in group)
yield from range(length, 0, -1)
def mseifert1(iterable):
return list(gen1(iterable))
def mseifert2(iterable):
return list(gen2(iterable))
def aran(x_list):
last_num = None
result = []
for num in reversed(x_list):
if num != last_num:
counter = 1
last_num = num
else:
counter += 1
result.append(counter)
return list(reversed(result))
def jpp(x_list):
gen = (range(len(list(j)), 0, -1) for _, j in groupby(x_list))
res = list(chain.from_iterable(gen))
return res
def cumcount(a):
a = np.asarray(a)
b = np.append(False, a[:-1] != a[1:])
c = b.cumsum()
r = np.arange(len(a))
return r - np.append(0, np.flatnonzero(b))[c] + 1
def pirsquared(x_list):
a = np.array(x_list)
return cumcount(a[::-1])[::-1]
from simple_benchmark import benchmark
import random
funcs = [mseifert1, mseifert2, aran, jpp, pirsquared]
args = {2**i: [random.randint(0, 5) for _ in range(2**i)] for i in range(1, 20)}
bench = benchmark(funcs, args, "list size")
%matplotlib notebook
bench.plot()
Python 3.6.5, NumPy 1.14
Here's a simple iterative approach to achieve it using collections.Counter:
from collections import Counter
x_list = [1, 1, 2, 3, 3, 3]
x_counter, run_list = Counter(x_list), []
for x in x_list:
run_list.append(x_counter[x])
x_counter[x] -= 1
which will return you run_list as:
[2, 1, 1, 3, 2, 1]
As an alternative, here's one-liner to achieve this using list comprehension with enumerate but it is not performance efficient due to iterative usage of list.index(..):
>>> [x_list[i:].count(x) for i, x in enumerate(x_list)]
[2, 1, 1, 3, 2, 1]
You can count the consecutive equal items and then add a countdown from count-of-items to 1 to the result:
def runs(p):
old = p[0]
n = 0
q = []
for x in p:
if x == old:
n += 1
else:
q.extend(range(n, 0, -1))
n = 1
old = x
q.extend(range(n, 0, -1))
return q
(A couple of minutes later) Oh, that's the same as MSeifert's code but without the iterable aspect. This version seems to be almost as fast as the method shown by Aran-Fey.
Related
Hey this is my first question so I hope I'm doing it right.
I'm trying to write a function that given a list of integers and N as the maximum occurrence, would then return a list with any integer above the maximum occurrence deleted. For example if I input:
[20,37,20,21] #list of integers and 1 #maximum occurrence.
Then as output I would get:
[20,37,21] because the number 20 appears twice and the maximum occurrence is 1, so it is deleted from the list. Here's another example:
Input: [1,1,3,3,7,2,2,2,2], 3
Output: [1,1,3,3,7,2,2,2]
Here's what I wrote so far, how would I be able to optimize it? I keep on getting a timeout error. Thank you very much in advance.
def delete_nth(order,n):
order = Counter(order)
for i in order:
if order[i] > n:
while order[i] > n:
order[i] - 1
return order
print(delete_nth([20,37,20,21], 1))
You can remove building the Counter at the beginning - and just have temporary dictionary as counter:
def delete_nth(order,n):
out, counter = [], {}
for v in order:
counter.setdefault(v, 0)
if counter[v] < n:
out.append(v)
counter[v] += 1
return out
print(delete_nth([20,37,20,21], 1))
Prints:
[20, 37, 21]
You wrote:
while order[i] > n:
order[i] - 1
That second line should presumably be order[i] -= 1, or any code that enters the loop will never leave it.
You could use a predicate with a default argument collections.defaultdict to retain state as your list of numbers is being filtered.
def delete_nth(numbers, n):
from collections import defaultdict
def predicate(number, seen=defaultdict(int)):
seen[number] += 1
return seen[number] <= n
return list(filter(predicate, numbers))
print(delete_nth([1, 1, 3, 3, 7, 2, 2, 2, 2], 3))
Output:
[1, 1, 3, 3, 7, 2, 2, 2]
>>>
I've renamed variables to something that had more meaning for me:
This version, though very short and fairly efficient, will output identical values adjacently:
from collections import Counter
def delete_nth(order, n):
counters = Counter(order)
output = []
for value in counters:
cnt = min(counters[value], n)
output.extend([value] * cnt)
return output
print(delete_nth([1,1,2,3,3,2,7,2,2,2,2], 3))
print(delete_nth([20,37,20,21], 1))
Prints:
[1, 1, 2, 2, 2, 3, 3, 7]
[20, 37, 21]
This version will maintain original order, but run a bit more slowly:
from collections import Counter
def delete_nth(order, n):
counters = Counter(order)
for value in counters:
counters[value] = min(counters[value], n)
output = []
for value in order:
if counters[value]:
output.append(value)
counters[value] -= 1
return output
print(delete_nth([1,1,2,3,3,2,7,2,2,2,2], 3))
print(delete_nth([20,37,20,21], 1))
Prints:
[1, 1, 2, 3, 3, 2, 7, 2]
[20, 37, 21]
Input:Given items=[1,2,3] and values=[100,300,800] OR it can be in dictionary={1:100,2:300,3:800}.
Find all combinations items such that sum values is less than 500
Solution:
[1]
[2]
[1,2]
This has to be done for millions of inputs.
WHat is the best and fastest algorithm to implement this??
import copy
dictionary = {
100: 1,
200: 2,
800: 3,
}
value = sorted([100, 200, 800])
threshold = 500
res = []
def dfs(l, s, cur):
if s < threshold:
if len(l) > 0:
res.append(l)
else:
return
for i in range(cur + 1, len(value)):
tmp = copy.copy(l)
tmp.append(dictionary.get(value[i]))
dfs(tmp, s+value[i], i)
dfs([], 0, -1)
print res
Time complexity is O(K). K is number of correct result.
A much more efficient method is to use breadth-first-search and avoid enqueueing any further if the current item value plus the current sum already reaches the limit, so that in a value list of [1, 2, 3, 4, 5] and a limit of 5, if the current combination of values is [1, 2] and the current item value is 3, then since we find that 1 + 2 + 3 is already no less than 5, we will not enqueue [1, 2, 3] for further search. This drastically cuts down on the number of combinations we need to test:
from collections import deque
def sums_less_than(items, values, limit):
seeds = [(index, 0, [], item_value) for index, item_value in enumerate(zip(items, values))]
queue = deque(seeds)
while queue:
index, _sum, combination, (item, value) = queue.popleft()
new_sum = _sum + value
if new_sum < limit:
new_combination = combination + [item]
yield new_combination
for i in range(index + 1, len(seeds)):
queue.append((i, new_sum, new_combination, seeds[i][-1]))
so that:
items=[1,2,3]
values=[100,300,800]
print(list(sums_less_than(items, values, 500)))
will output:
[[1], [2], [1, 2]]
You can use itertools.combinations on a zipped sequence of items and values after filtering out values that are greater than the limit first:
from itertools import combinations
items=[1,2,3]
values=[100,300,800]
def sums_less_than(items, values, limit):
filtered = [(item, value) for item, value in zip(items, values) if value < limit]
return [[item for item, _ in c] for n in range(1, len(filtered) + 1) for c in combinations(filtered, n) if sum(value for _, value in c) < limit]
print(sums_less_than(items, values, 500))
This outputs:
[[1], [2], [1, 2]]
With modificaton:
import copy
dictionary = {
100: 1,
200: 2,
800: 3,
50 : 4,
}
value = sorted(dictionary.keys())
threshold = 500
res = []
thres_val=[]
def dfs(l, s, cur):
if s < threshold:
if len(l) > 0:
res.append(l)
thres_val.append(s)
else:
return
for i in range(cur + 1, len(value)):
tmp = copy.copy(l)
tmp.append(dictionary.get(value[i]))
dfs(tmp, s+value[i], i)
dfs([], 0, -1)
print(res)
print(thres_val)
print("\tItem list-->Value")
j=0
for i in res:
print("\t",i,"-->",thres_val[j])`
I want to find the start position of the longest sequence of 1's in my array:
a1=[0,0,1,1,1,1,0,0,1,1]
#2
I am following this answer to find the length of the longest sequence. However, I was not able to determine the position.
Inspired by this solution, here's a vectorized approach to solve it -
# Get start, stop index pairs for islands/seq. of 1s
idx_pairs = np.where(np.diff(np.hstack(([False],a1==1,[False]))))[0].reshape(-1,2)
# Get the island lengths, whose argmax would give us the ID of longest island.
# Start index of that island would be the desired output
start_longest_seq = idx_pairs[np.diff(idx_pairs,axis=1).argmax(),0]
Sample run -
In [89]: a1 # Input array
Out[89]: array([0, 0, 1, 1, 1, 1, 0, 0, 1, 1])
In [90]: idx_pairs # Start, stop+1 index pairs
Out[90]:
array([[ 2, 6],
[ 8, 10]])
In [91]: np.diff(idx_pairs,axis=1) # Island lengths
Out[91]:
array([[4],
[2]])
In [92]: np.diff(idx_pairs,axis=1).argmax() # Longest island ID
Out[92]: 0
In [93]: idx_pairs[np.diff(idx_pairs,axis=1).argmax(),0] # Longest island start
Out[93]: 2
A more compact one-liner using groupby(). Uses enumerate() on the raw data to keep the starting positions through the analysis pipeline, evenutally ending up with the list of tuples [(2, 4), (8, 2)] each tuple containing the starting position and length of non-zero runs:
from itertools import groupby
L = [0,0,1,1,1,1,0,0,1,1]
print max(((lambda y: (y[0][0], len(y)))(list(g)) for k, g in groupby(enumerate(L), lambda x: x[1]) if k), key=lambda z: z[1])[0]
lambda: x is the key function for groupby() since we enumerated L
lambda: y packages up results we need since we can only evaluate g once, without saving
lambda: z is the key function for max() to pull out the lengths
Prints '2' as expected.
This seems to work, using groupby from itertools, this only goes through the list once:
from itertools import groupby
pos, max_len, cum_pos = 0, 0, 0
for k, g in groupby(a1):
if k == 1:
pat_size = len(list(g))
pos, max_len = (pos, max_len) if pat_size < max_len else (cum_pos, pat_size)
cum_pos += pat_size
else:
cum_pos += len(list(g))
pos
# 2
max_len
# 4
You could use a for loop and check if the next few items (of length m where m is the max length) are the same as the maximum length:
# Using your list and the answer from the post you referred
from itertools import groupby
L = [0,0,1,1,1,1,0,0,1,1]
m = max(sum(1 for i in g) for k, g in groupby(L))
# Here is the for loop
for i, s in enumerate(L):
if len(L) - i + 2 < len(L) - m:
break
if s == 1 and 0 not in L[i:i+m]:
print i
break
This will give:
2
Another way of doing in a single loop, but without resorting to itertool's groupby.
max_start = 0
max_reps = 0
start = 0
reps = 0
for (pos, val) in enumerate(a1):
start = pos if reps == 0 else start
reps = reps + 1 if val == 1 else 0
max_reps = max(reps, max_reps)
max_start = start if reps == max_reps else max_start
This could also be done in a one-liner fashion using reduce:
max_start = reduce(lambda (max_start, max_reps, start, reps), (pos, val): (start if reps == max(reps, max_reps) else max_start, max(reps, max_reps), pos if reps == 0 else start, reps + 1 if val == 1 else 0), enumerate(a1), (0, 0, 0, 0))[0]
In Python 3, you cannot unpack tuples inside the lambda arguments definition, so it's preferable to define the function using def first:
def func(acc, x):
max_start, max_reps, start, reps = acc
pos, val = x
return (start if reps == max(reps, max_reps) else max_start,
max(reps, max_reps),
pos if reps == 0 else start,
reps + 1 if val == 1 else 0)
max_start = reduce(func, enumerate(a1), (0, 0, 0, 0))[0]
In any of the three cases, max_start gives your answer (i.e. 2).
Using more_itertools, a third-party library:
Given
import itertools as it
import more_itertools as mit
lst = [0, 0, 1, 1, 1, 1, 0, 0, 1, 1]
Code
longest_contiguous = max([tuple(g) for _, g in it.groupby(lst)], key=len)
longest_contiguous
# (1, 1, 1, 1)
pred = lambda w: w == longest_contiguous
next(mit.locate(mit.windowed(lst, len(longest_contiguous)), pred=pred))
# 2
See also the more_itertools.locate docstring for details on how these tools work.
For another solution that uses only Numpy, I think this should work in all the cases. The most upvoted solution is probably faster though.
tmp = np.cumsum(np.insert(np.array(a1) != 1, 0, False)) # value of tmp[i+1] was not incremented when a1[i] is 1
# [0, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4]
values, counts = np.unique(tmp, return_counts=True)
# [0, 1, 2, 3, 4], [1, 1, 5, 1, 3]
counts_idx = np.argmax(counts)
longest_sequence_length = counts[counts_idx] - 1
# 4
longest_sequence_idx = np.argmax(tmp == values[counts_idx])
# 2
I've implemented a run-searching function for numpy arrays in haggis.npy_util.mask2runs. You can use it like this:
runs, lengths = mask2runs(a1, return_lengths=True)
result = runs[lengths.argmax(), 0]
I have a list named x, I would like to fill the zero data with previous value, which means:
x = [x[t]=x[t-1] if x[t] == 0.0 for t in range(1,len(x)-2)]
But it displayed: SyntaxError: invalid syntax
I'm wondering where is wrong with my code? Thanks a lot.
It's your assignment x[t] = x[t-1]. Instead just use a for loop:
for t in range(1, len(x)-1):
if x[t] == 0:
x[t] = x[t-1]
Although it would probably be considered more Pythonic to use enumerate to do this:
for idx, val in enumerate(x):
if idx==0: continue # skip the first element
if val == 0:
x[idx] = x[idx-1]
# DEMO
In [1]: x = [1,0,3,0,4,0,5,0]
In [2]: for idx,val in enumerate(x):
...: if idx==0: continue
...: if val == 0:
...: x[idx] = x[idx-1]
...:
In [3]: x
Out[3]: [1, 1, 3, 3, 4, 4, 5, 5]
You could also make this work with a list comp by implementing a pairwise iterator
from itertools import tee
def pairwise(iterable):
a,b = tee(iterable)
next(b) # advance one iterator
return zip(a,b)
x = [x[0]] + [val if val else lastval for lastval,val in pairwise(x)]
We need to specifically add the first element since the pairwise iterator skips it. Alternatively we could define pairwise differently, e.g.
def pairwise(iterable):
iterable = itertools.chain([None], iterable)
a,b = itertools.tee(iterable)
next(b)
return zip(a,b)
x = [val if val else lastval for lastval,val in pairwise(x)]
# ta-da!
Here's a list comprehension to do what you require:
x = [xi if xi or i==0 else x[i-1]
for i, xi in enumerate(x)]
For full forward and backward filling (backwards in case non found before), the following will give you a filled element even if the element before it is zero:
# ∨ ∨ ∨ ∨ ∨ ∨
x = [ 0, 1, 2, 0, 3, 5, 0, 0, 0, 9, 0 ]
y = []
for i,e in enumerate(x):
if e == 0:
# search left, get first non zero
for left_e in reversed(x[:i]):
if left_e != 0:
e = left_e
break
# backward fill if all elements on the left are zeros
if e == 0:
# search right, get first non zero
for right_e in x[i+1:]:
if right_e != 0:
e = right_e
break
y.append(e)
print(y)
# [1, 1, 2, 2, 3, 5, 5, 5, 5, 9, 9]
If you want forward filling with looking only at one the previous element then Alex's answers suffice.
You can also use a simpler method next():
x = [ 0, 1, 2, 0, 3, 5, 0, 0, 0, 9, 0 ]
y = []
for i,e in enumerate(x):
if e == 0:
e = next((item for item in x[i:] if item != 0), e)
e = next((item for item in reversed(x[:i]) if item != 0), e)
y.append(e)
print(y)
# [1, 1, 2, 2, 3, 5, 5, 5, 5, 9, 9]
Q: A run is a sequence of adjacent repeated values. Given a list, write a function to
determine the length of the longest run. For example, for the sequence [1, 2, 5, 5, 3, 1, 2, 4, 3, 2, 2, 2, 2, 3, 6, 5, 5, 6, 3, 1], the longest run is 4.
I am having trouble with this, I've written a code that finds the longest run consist of the number '2' but have yet to get the length of the run which is 4.
Here is my code so far (i've commented out a part that i was working on but don't pay attention to it):
# longestrun.py
# A function to determine the length of the longest run
# A run is a sequence of adjacent repeated values.
def longestrun(myList):
result = None
prev = None
size = 0
max_size = 0
for i in myList:
if i == prev:
size += 1
if size > max_size:
result = i
max_size = size
else:
size = 0
prev = i
return result
def main():
print("This program finds the length of the longest run within a given list.")
print("A run is a sequence of adjacent repeated values.")
myString = input("Please enter a list of objects (numbers, words, etc.) separated by
commas: ")
myList = myString.split(',')
longest_run = longestrun(myList)
print(">>>", longest_run, "<<<")
main()
Help please!!! :(((
You can do this in one line using itertools.groupby:
import itertools
max(sum(1 for _ in l) for n, l in itertools.groupby(lst))
This should work if you do not want to use itertools and imports.
a=[1, 2, 5, 5, 3, 1, 2, 4, 3, 2, 2, 2, 2, 3, 6, 5, 5, 6, 3, 1]
def longestrun(myList):
result = None
prev = None
size = 0
max_size = 0
for i in myList:
if i == prev:
print (i)
size += 1
if size > max_size:
print ('******* '+ str(max_size))
max_size = size
else:
size = 0
prev = i
print (max_size+1)
return max_size+1
longestrun(a)
Just another way of doing it:
def longestrun(myList):
sett = set()
size = 1
for ind, elm in enumerate(myList):
if ind > 0:
if elm == myList[ind - 1]:
size += 1
else:
sett.update([size])
size = 1
sett.update([size])
return max(sett)
myList = [1, 2, 5, 5, 3, 1, 2, 4, 3, 2, 2, 2, 2, 3, 6, 5, 5, 6, 3, 1]
print longestrun(myList)
def getSublists(L,n):
outL=[]
for i in range(0,len(L)-n+1):
outL.append(L[i:i+n])
return outL
def longestRun(L):
for n in range(len(L), 0, -1):
temp=getSublists(L,n)
for subL in temp:
if subL==sorted(subL):
return len(subL)
def longestrun(myList):
size = 1
max_size = 0
for i in range(len(myList)-1):
if myList[i+1] = myList[i]:
size += 1
else:
size = 1
if max_size<size:
max_size = size
return size
Remove the .split() from myList in main() and you're good to go with this.
As an update to David Robinson's answer, it is now (Python 3.4) possible to return 0 on an empty sequence (instead of raising ValueError):
import itertools
max((sum(1 for _ in l) for n, l in itertools.groupby(lst)), default=0)