Randomizing a list of zeros and ones with constraints - python

I'm currently trying to randomize a list of 0s and 1s which should give a random order of zeros and ones with the following constraints:
1/3 of the items have to be 1s (respectively 2/3 are 0s)
No more than two 1s should occur consecutively
No more than four zeros should occur consecutively
I have worked on an option, but it did not exactly turn out to be what I need. Here's my option:
# NOTE(review): enumerate() yields (index, value); the names are misleading --
# prevItem is an index and nextItem is the value at that index.
for prevItem, nextItem in enumerate(WordV[: -1]):
    # Break a run of three 1s by forcing the third element to 0.
    # NOTE(review): indexing prevItem+2 (and +4 below) can raise IndexError
    # near the end of the list -- the [:-1] slice only trims one element.
    if nextItem == WordV[prevItem+1] and WordV[prevItem+1] == WordV[prevItem+2] and nextItem ==1:
        WordV[prevItem+2] = 0
    # Break a run of five 0s by forcing its middle element to 1.
    if nextItem == WordV[prevItem+1] and WordV[prevItem+1] == WordV[prevItem+2] and WordV[prevItem+2] == WordV[prevItem+3] and WordV[prevItem+3] == WordV[prevItem+4] and nextItem == 0:
        WordV[prevItem+2] = 1
# Check the number of ones & zeros
# NOTE(review): the in-place overwrites above change the element counts,
# which is why the 24:48 (1/3 : 2/3) proportion is not preserved.
print(WordV)
ones= WordV.count(1)
zeros= WordV.count(0)
print(ones, zeros)
Currently, the number of ones and zeros does not add up to a proportion of 1/3 to 2/3 because the constraints replace numbers. The WordV list is a list containing 24 ones and 48 zeros that is shuffled randomly (with random.shuffle(WordV)).
Is there a smarter (and more correct) way to integrate the constraints into the code?

import numpy as np
def consecutive(data, stepsize=0):
    """Split `data` into runs of equal elements.

    np.diff is non-zero wherever adjacent elements differ (stepsize=0),
    so those positions (+1) are the boundaries between runs.
    """
    boundaries = np.where(np.diff(data) != stepsize)[0] + 1
    return np.split(data, boundaries)
def check(list_to_check):
    """Return True if the list violates a run constraint, else False.

    A violation is a run of more than two consecutive 1s or more than
    four consecutive 0s.  Fixed: the original fell off the end and
    returned None implicitly; an explicit False is clearer (truthiness
    for existing callers is unchanged).
    """
    groups = consecutive(list_to_check)
    for group in groups:
        if group[0] == 1 and group.size > 2:
            return True
        if group[0] == 0 and group.size > 4:
            return True
    return False
# Rejection sampling: start from the exact 24:48 ratio and reshuffle the
# whole array until check() reports no forbidden run.
wordv = np.array([1]*24+[0]*48)
while check(wordv):
    np.random.shuffle(wordv)
wordv will contain something like:
array([0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
0, 0, 1, 0, 1, 0])
The consecutive function will split the data in groups containing the same element:
[ins] In [32]: consecutive([1,1,1,0,0,1])
Out[32]: [array([1, 1, 1]), array([0, 0]), array([1])]
The check will check both conditions you specified and we will shuffle the list until we meet the conditions

You could try an optimization approach: Start with the list holding the elements in the right proportion, then keep swapping random elements until you get the desired results. In each turn, check the number of too-long streaks of 0s or 1s and always keep the better one of the original or the mutated list.
import itertools, random
def penalty(lst):
    """Count the runs that break the constraints: a run of 0s longer
    than four, or a run of 1s longer than two."""
    bad = 0
    for value, run in itertools.groupby(lst):
        length = sum(1 for _ in run)
        if (value == 0 and length > 4) or (value == 1 and length > 2):
            bad += 1
    return bad
def constrained_shuffle(lst):
    """Repair `lst` in place by hill-climbing: swap random pairs until
    penalty() reports no over-long run of 0s or 1s."""
    current = penalty(lst)
    while current > 0:
        # Propose a random swap of two positions.
        i = random.randrange(len(lst))
        j = random.randrange(len(lst))
        lst[i], lst[j] = lst[j], lst[i]
        proposed = penalty(lst)
        if proposed > current:
            # Worse than before: undo the swap.
            lst[i], lst[j] = lst[j], lst[i]
        else:
            current = proposed
# Demo: 2/3 zeros and 1/3 ones; shuffle first so the repair step does not
# always start from the same ordered list.
lst = [0] * 20 + [1] * 10
random.shuffle(lst)
constrained_shuffle(lst)
print(lst)
For 200 0s and 100 1s this will take a few hundred to a few thousand iterations until it finds a valid list, which is okay. For lists with thousands of elements this is rather too slow, but could probably be improved by memorizing the positions of the too-long streaks and preferably swapping elements within those.
About the "randomness" of the approach: Of course, it is less random than just repeatedly generating a new shuffled list until one fits the constraints, but I don't see how this will create a bias for or against certain lists, as long as those satisfy the constraints. I did a short test, repeatedly generating shuffled lists and counting how often each variant appears:
# Bias experiment: generate many constrained lists and count how often each
# distinct list appears.
# NOTE(review): this snippet uses collections.Counter but never imports
# collections -- `import collections` is required to run it.
counts = collections.Counter()
for _ in range(10000):
    lst = [0] * 10 + [1] * 5
    random.shuffle(lst)
    constrained_shuffle(lst)
    counts[tuple(lst)] += 1
# Histogram of "how many distinct lists were seen exactly k times".
print(collections.Counter(counts.values()).most_common())
[(7, 197), (6, 168), (8, 158), (9, 157), (5, 150), (10, 98), (4, 92),
(11, 81), (12, 49), (3, 49), (13, 43), (14, 23), (2, 20), (15, 10),
(1, 8), (16, 4), (17, 3), (18, 1)]
So, yes, maybe there are a few lists that are more likely than others (one appeared 18 times, three 17 times, and most others 5-9 times). For 100,000 iterations, the "more likely" lists appear ~50% more often than the others, but still only about 120 times out of those 100,000 iterations, so I'd think that this is not too much of a problem.
Without the initial random.shuffle(lst) there are more lists that appear much more often than the average, so this step should not be skipped.

I don't really know python, so I'll give you pseudocode:
// Pseudocode (Java-flavoured, not compilable as-is).  Review notes inline.
int length;
int[] onesAndZeros = new int[length];
for(int i: onesAndZeros) { // generate a random list
    // NOTE(review): in real Java assigning to a for-each variable does NOT
    // write back into the array; an indexed loop would be needed.
    i = random(0, 1);
}
int zeroCount() { // correct the ratio
    // NOTE(review): `c` is never initialised; should start at 0.
    int c;
    for(int i: onesAndZeros) {
        if(i == 0) {
            c++;
        }
    }
    return c;
}
int wantedZeros;
if(zeroCount() / (length - zeroCount()) != 2) { // you should probably check a small interval, but this answer is already long
    int a = 2*(length - zeroCount()) - zeroCount(); // I will include the math if necessary
    wantedZeros = zeroCount() + a;
}
while(zeroCount() != wantedZeros) {
    boolean isLess = zeroCount < wantedZeros;
    if(isLess) {
        onesAndZeros[random(0, length - 1)] = 0;
    } else {
        // NOTE(review): BUG -- both branches assign 0.  When there are too
        // many zeros this branch presumably should assign 1, otherwise the
        // loop can never terminate.
        onesAndZeros[random(0, length - 1)] = 0;
    }
}
string isCorrect() { // fix the 2 1s and 4 0s
    for(int i = 0; i < length; i++) {
        if(onesAndZeros[i] == 0 &&
           onesAndZeros[i + 1] == 0 &&
           onesAndZeros[i + 2] == 0 &&
           onesAndZeros[i + 3] == 0 &&
           onesAndZeros[i + 4] == 0) { // be sure not to go out of bounds!
            return "0" + i;
        } else
        if(onesAndZeros[i] == 1 &&
           onesAndZeros[i + 1] == 1 &&
           onesAndZeros[i + 2] == 1) {
            return "1" + i;
        } else {
            // NOTE(review): BUG -- this else returns on the very first loop
            // iteration, so only index 0 is ever inspected.  Dropping the
            // else (returning "a" after the loop) is presumably intended.
            return "a";
        }
    }
}
void fix(int type, int idx) {
    if(type == 0) {
        onesAndZeros[idx + 4] = 1;
    } else {
        onesAndZeros[idx + 2] = 0;
    }
}
string corr = isCorrect();
while(length(corr) >= 2) { // note: this step will screw up the ones/zeros ratio a bit, if you want to restore it, consider running the last 2 steps again
    // NOTE(review): BUG -- `corr` is never recomputed inside the loop, so
    // this loops forever; `corr = isCorrect();` must follow each fix().
    if(corr[0] == '0') {
        fix(0, toInt(removeFirstChar(corr)));
    } else {
        fix(1, toInt(removeFirstChar(corr)));
    }
}
// done!

Related

Sorting arrays - least steps

I wonder what is the best algorithm to sort binary array with least swaps? (having array eg [0,0,0,1,0,1,0] to become [0,0,0,0,0,1,1]).
I implemented some bubble sorts but wonder what is the optimisation for those?
My code is in python, but any language is welcomed, if anyone has a solution for the least swaps that would really improve my program and i would really appreciate!!
If you really want to do it using swaps, you can start from both ends and swap the 1s you find on the left side going forward with the 0s you find on the right side going backward.
A = [0,0,0,1,0,1,0]
# Indices of misplaced 1s scanning forward, and of misplaced 0s scanning
# backward; pairing them gives exactly the swaps needed.
ones_from_left = (idx for idx, bit in enumerate(A) if bit == 1)
zeros_from_right = (len(A) - pos for pos, bit in enumerate(reversed(A), 1) if bit == 0)
swapCount = 0
for lo, hi in zip(ones_from_left, zeros_from_right):
    if lo >= hi:
        # Pointers crossed: everything left is already in place.
        break
    A[lo], A[hi] = A[hi], A[lo]
    swapCount += 1
print(A) # [0, 0, 0, 0, 0, 1, 1]
print(swapCount,"swaps") # 1 swaps
Note that the same logic can be written without the use of iterators and zip:
A = [0,0,0,1,0,1,0]
swaps = 0
# Forward and backward indexes; close in until they meet.
f, b = 0, len(A)-1
while f < b:
    if A[f] == 0:
        # 0 is already on the correct side.
        f += 1
    elif A[b] == 1:
        # 1 is already on the correct side.
        b -= 1
    else:
        # A[f] == 1 and A[b] == 0: both misplaced, swap them.
        A[f], A[b] = A[b], A[f]
        swaps += 1
print(A) # [0, 0, 0, 0, 0, 1, 1]
print(swaps,"swaps") # 1 swaps
If you don't want to use .sort() or similar stuff I can think of that solution:
arr = [0,0,0,1,0,1,0]
# A binary list is fully described by its 0/1 counts, so rebuild it
# instead of sorting.
print([0] * arr.count(0) + [1] * arr.count(1))
Edit:
# Faster variant: count the zeros once and derive the count of ones from
# the length, saving the second count() pass.
l = len(arr)
z = arr.count(0)
print([0]*z + [1] * (l - z))
seems to be faster with timeit.timeit
You don't need to sort a binary array, especially with python's small integer interning.
The count of ones is given by
ones = sum(lst)
The count of zeros is the length minus that:
zeros = len(lst) - ones
You can construct the right list with
[0] * zeros + [1] * ones
Many languages have their own implementation.
Example for javascript:
[0,0,0,1,0,1,0].sort();
returns
[ 0, 0, 0, 0, 0, 1, 1 ]
Edited: adding my own implementation on javascript.
The strategy was navigate from the begin and stop when found 1, and then navigate from the end to found 0 to swap.
It's one of multiple possible implementations.
function sortBinaryArray(binArr){
let i = 0;
let j = binArr.length;
let swapCount = 0;
while(i < j){
if(binArr[i] === 0) {
// found 0 on position i. Its sorted until now.
i++;
continue;
}
// Found 1 on posittion i. Search from the end for 0 to swap
j--;
while(i < j){
if (binArr[j] === 0) {
// Found position to swap
binArr[i] = 0;
binArr[j] = 1;
i++;
swapCount++;
break;
}
j--
}
}
console.log('swapCount='+swapCount);
}
// Demo: sortBinaryArray sorts the array in place and logs the swap count.
var myArr = [0,0,0,1,0,1,0];
sortBinaryArray(myArr);
console.log(myArr)
output:
swapCount=1
Array(7) [ 0, 0, 0, 0, 0, 1, 1 ]

Maximum sum of subsequence of length L with a restriction

Given an array of positive integers. How to find a subsequence of length L with max sum which has the distance between any two of its neighboring elements that do not exceed K
I have the following solution but don't know how to take into account length L.
1 <= N <= 100000, 1 <= L <= 200, 1 <= K <= N
f[i] contains max sum of the subsequence that ends in i.
# Pseudocode sketch (not runnable Python: the loop headers lack colons).
# f[i] = max sum of a subsequence ending at index i whose neighbouring
# picks are at most K apart.
for i in range(K, N)
    f[i] = INT_MIN
    for j in range(1, K+1)
        f[i] = max(f[i], f[i-j] + a[i])
return max(f)
# NOTE(review): nothing here tracks how many elements were taken, so the
# requirement of exactly L elements is not enforced -- which is exactly
# the question being asked.
(edit: slightly simplified non-recursive solution)
You can do it like this, just for each iteration consider if the item should be included or excluded.
def f(maxK, K, N, L, S):
    """Max sum of a subsequence of L elements of N whose neighbouring
    picks are at most maxK positions apart.

    K is the remaining gap budget before the chain breaks; S is the sum
    accumulated so far.  When L, N or the gap budget runs out, the
    accumulated sum is returned as-is.
    """
    if L == 0 or not N or K == 0:
        return S
    head, rest = N[0], N[1:]
    # Take the head: the gap budget resets to maxK, one fewer element needed.
    take = f(maxK, maxK, rest, L - 1, S + head)
    # Skip the head: the gap budget shrinks by one.
    skip = f(maxK, K - 1, rest, L, S)
    return max(take, skip)

assert f(2,2,[10,1,1,1,1,10],3,0) == 12
assert f(3,3,[8, 3, 7, 6, 2, 1, 9, 2, 5, 4],4,0) == 30
If N is very long you can consider changing to a table version, you could also change the input to tuples and use memoization.
Since OP later included the information that N can be 100 000, we can't really use recursive solutions like this. So here is a solution that runs in O(nKL), with same memory requirement:
import numpy as np
def f(n, K, L):
    """Max sum of a subsequence of exactly L elements of n where chosen
    neighbours are at most K positions apart.  O(len(n) * K * L).

    t[i, l] = best sum of an l-element subsequence that ends at index i.
    Fixed two defects of the original: (1) it returned np.max over the
    whole table, which mixes in subsequences shorter than L; (2) it used
    0 as the default for unreachable predecessors, so impossible states
    looked valid (e.g. f([10], 1, 2) returned 10 even though no length-2
    subsequence exists).  Unreachable states are now -inf and only the
    length-L column is considered; returns -inf when no valid
    subsequence exists.
    """
    t = np.full((len(n), L + 1), -np.inf)
    for l in range(1, L + 1):
        for i in range(len(n)):
            if l == 1:
                prev = 0.0  # a single element needs no predecessor
            else:
                # Best (l-1)-element subsequence ending at most K back.
                prev = max((t[i - k, l - 1] for k in range(1, K + 1) if i - k >= 0),
                           default=-np.inf)
            t[i, l] = n[i] + prev
    return np.max(t[:, L])
assert f([10,1,1,1,1,10],2,3) == 12
assert f([8, 3, 7, 6, 2, 1, 9],3,4) == 30
Explanation of the non recursive solution. Each cell in the table t[ i, l ] expresses the value of max subsequence with exactly l elements that use the element in position i and only elements in position i or lower where elements have at most K distance between each other.
subsequences of length n (those in t[i,1] have to have only one element, n[i] )
Longer subsequences have the n[i] + a subsequence of l-1 elements that starts at most k rows earlier, we pick the one with the maximal value. By iterating this way, we ensure that this value is already calculated.
Further improvements in memory is possible by considering that you only look at most K steps back.
Here is a bottom up (ie no recursion) dynamic solution in Python. It takes memory O(l * n) and time O(l * n * k).
def max_subseq_sum(k, l, values):
    """Bottom-up DP: rows[i][j] is the highest sum of a length-j
    subsequence ending at position i, with neighbouring picks at most k
    apart.  Memory O(l*n), time O(l*n*k)."""
    rows = []
    for i in range(len(values)):
        # Length 0 sums to 0; length 1 is the element itself.
        rows.append([0, values[i]])
        for length in range(1, l):
            # Best predecessor of this length within the last k positions;
            # the `length <= p + 1` guard skips positions that cannot host
            # a subsequence that long (and filters negative indices).
            best_prev = max(
                (rows[p][length] for p in range(i - k, i) if length <= p + 1),
                default=None)
            if best_prev is not None:
                rows[i].append(best_prev + values[i])
    # Rows long enough to contain a length-l entry have len > l.
    finals = [row[l] for row in rows if l < len(row)]
    return max(finals) if finals else None
print(max_subseq_sum(2, 3, [10, 1, 1, 1, 1, 10]))
print(max_subseq_sum(3, 4, [8, 3, 7, 6, 2, 1, 9, 2, 5, 4]))
If I wanted to be slightly clever I could make this memory O(n) pretty easily by calculating one layer at a time, throwing away the previous one. It takes a lot of cleverness to reduce running time to O(l*n*log(k)) but that is doable. (Use a priority queue for your best value in the last k. It is O(log(k)) to update it for each element but naturally grows. Every k values you throw it away and rebuild it for a O(k) cost incurred O(n/k) times for a total O(n) rebuild cost.)
And here is the clever version. Memory O(n). Time O(n*l*log(k)) worst case, and average case is O(n*l). You hit the worst case when it is sorted in ascending order.
import heapq
def max_subseq_sum(k, l, values):
    """Layered DP with a sliding-window maximum kept in a heap.

    Layer i (i = 0..l-1) computes, for each end position, the best sum of
    a subsequence of i+1 elements whose neighbours are at most k apart.
    Time O(n*l*log k) worst case; memory O(n).

    Fixes over the original: `max(prev_best)` raised TypeError on
    Python 3 whenever l > 1, because the first l-1 entries of the final
    layer are None (None and int are not comparable); the None entries
    are now filtered out.  The unused `count` variable was removed.
    """
    prev_best = [0 for _ in values]
    for i in range(l):
        # No subsequence of i+1 elements can end before position i.
        best = [None for _ in range(i)]
        # Min-heap of (-sum, index): the top holds the best predecessor.
        # For i == 0 this seeds (-0, -1), a harmless sentinel.
        min_heap = [(-prev_best[i - 1], i - 1)]
        for j in range(i, len(values)):
            # Drop predecessors that are more than k positions back.
            while min_heap[0][-1] < j - k:
                heapq.heappop(min_heap)
            # values[j] + (best previous sum), using -(-s) = +s.
            best.append(values[j] - min_heap[0][0])
            heapq.heappush(min_heap, (-prev_best[j], j))
            # Rebuild occasionally so the heap stays O(k) in size.
            if 2 * k < len(min_heap):
                min_heap = [entry for entry in min_heap if j - k < entry[1]]
                heapq.heapify(min_heap)
        prev_best = best
    # The first l-1 slots are None (no valid subsequence ends there).
    return max(x for x in prev_best if x is not None)
Extending the code for itertools.combinations shown at the docs, I built a version that includes an argument for the maximum index distance (K) between two values. It only needed an additional and indices[i] - indices[i-1] < K check in the iteration:
def combinations_with_max_dist(iterable, r, K):
    """Like itertools.combinations(iterable, r), but two successively
    chosen indices may be at most K apart (the extra
    `idx[i] - idx[i-1] < K` check versus the documented pure-Python
    equivalent of itertools.combinations)."""
    items = tuple(iterable)
    n = len(items)
    if r > n:
        return
    idx = list(range(r))
    yield tuple(items[p] for p in idx)
    while True:
        # Find the rightmost position that can still be advanced without
        # exceeding the maximum distance K to its left neighbour.
        for i in reversed(range(r)):
            if idx[i] != i + n - r and idx[i] - idx[i - 1] < K:
                break
        else:
            return
        # Advance it and reset everything to its right.
        idx[i] += 1
        for j in range(i + 1, r):
            idx[j] = idx[j - 1] + 1
        yield tuple(items[p] for p in idx)
Using this you can bruteforce over all combinations with regards to K, and then find the one that has the maximum value sum:
def find_subseq(a, L, K):
    """Brute force: score every K-constrained combination of L values and
    return the (sum, values) pair with the highest sum."""
    scored = ((sum(combo), combo) for combo in combinations_with_max_dist(a, L, K))
    return max(scored)
Results:
print(*find_subseq([10, 1, 1, 1, 1, 10], L=3, K=2))
# 12 (10, 1, 1)
print(*find_subseq([8, 3, 7, 6, 2, 1, 9, 2, 5, 4], L=4, K=3))
# 30 (8, 7, 6, 9)
Not sure about the performance if your value lists become very long though...
Algorithm
Basic idea:
Iteration on input array, choose each index as the first taken element.
Then Recursion on each first taken element, mark the index as firstIdx.
The next possible index would be in range [firstIdx + 1, firstIdx + K], both inclusive.
Loop on the range to call each index recursively, with L - 1 as the new L.
Optionally, for each pair of (firstIndex, L), cache its max sum, for reuse.
Maybe this is necessary for large input.
Constraints:
array length <= 1 << 17 // 131072
K <= 1 << 6 // 64
L <= 1 << 8 // 256
Complexity:
Time: O(n * L * K)
Since each (firstIdx , L) pair only calculated once, and that contains a iteration of K.
Space: O(n * L)
For cache, and method stack in recursive call.
Tips:
Depth of recursion is related to L, not array length.
The defined constraints are not the actual limit, it could be larger, though I didn't test how large it can be.
Basically:
Both array length and K actually could be of any size as long as there are enough memory, since they are handled via iteration.
L is handled via recursion, thus it does have a limit.
Code - in Java
SubSumLimitedDistance.java:
import java.util.HashMap;
import java.util.Map;
// Max-sum subsequence of exactly L elements where neighbouring picks are at
// most K indices apart.  Top-down DP memoised on the (firstIdx, L) pair.
public class SubSumLimitedDistance {
    public static final long NOT_ENOUGH_ELE = -1; // sum that indicate not enough element, should be < 0,
    public static final int MAX_ARR_LEN = 1 << 17; // max length of input array,
    public static final int MAX_K = 1 << 6; // max K, should not be too long, otherwise slow,
    public static final int MAX_L = 1 << 8; // max L, should not be too long, otherwise stackoverflow,

    /**
     * Find max sum of sub array.
     *
     * @param arr   input array,
     * @param K     max distance between neighbouring chosen elements,
     * @param L     exact number of elements to choose,
     * @return max sum,
     */
    public static long find(int[] arr, int K, int L) {
        if (K < 1 || K > MAX_K)
            throw new IllegalArgumentException("K should be between [1, " + MAX_K + "], but get: " + K);
        if (L < 0 || L > MAX_L)
            throw new IllegalArgumentException("L should be between [0, " + MAX_L + "], but get: " + L);
        if (arr.length > MAX_ARR_LEN)
            throw new IllegalArgumentException("input array length should <= " + MAX_ARR_LEN + ", but get: " + arr.length);
        Map<Integer, Map<Integer, Long>> cache = new HashMap<>(); // cache,
        long maxSum = NOT_ENOUGH_ELE;
        // Try every index as the first taken element; once one start index
        // lacks enough elements, all later ones do too, hence the break.
        for (int i = 0; i < arr.length; i++) {
            long sum = findTakeFirst(arr, K, L, i, cache);
            if (sum == NOT_ENOUGH_ELE) break; // not enough elements,
            if (sum > maxSum) maxSum = sum; // larger found,
        }
        return maxSum;
    }

    /**
     * Find max sum of sub array, with index of first taken element specified,
     *
     * @param arr      input array,
     * @param K        max distance between neighbouring chosen elements,
     * @param L        number of elements still to take,
     * @param firstIdx index of first taken element,
     * @param cache    memoisation table keyed on (firstIdx, L),
     * @return max sum,
     */
    private static long findTakeFirst(int[] arr, int K, int L, int firstIdx, Map<Integer, Map<Integer, Long>> cache) {
        // System.out.printf("findTakeFirst(): K = %d, L = %d, firstIdx = %d\n", K, L, firstIdx);
        if (L == 0) return 0; // done,
        if (firstIdx + L > arr.length) return NOT_ENOUGH_ELE; // not enough elements,
        // check cache,
        Map<Integer, Long> map = cache.get(firstIdx);
        Long cachedResult;
        if (map != null && (cachedResult = map.get(L)) != null) {
            // System.out.printf("hit cache, cached result = %d\n", cachedResult);
            return cachedResult;
        }
        // cache not exists, calculate,
        long maxRemainSum = NOT_ENOUGH_ELE;
        // The next element may sit anywhere within K positions after firstIdx.
        for (int i = firstIdx + 1; i <= firstIdx + K; i++) {
            long remainSum = findTakeFirst(arr, K, L - 1, i, cache);
            if (remainSum == NOT_ENOUGH_ELE) break; // not enough elements,
            if (remainSum > maxRemainSum) maxRemainSum = remainSum;
        }
        if ((map = cache.get(firstIdx)) == null) cache.put(firstIdx, map = new HashMap<>());
        if (maxRemainSum == NOT_ENOUGH_ELE) { // not enough elements,
            map.put(L, NOT_ENOUGH_ELE); // cache - as not enough elements,
            return NOT_ENOUGH_ELE;
        }
        long maxSum = arr[firstIdx] + maxRemainSum; // max sum,
        map.put(L, maxSum); // cache - max sum,
        return maxSum;
    }
}
SubSumLimitedDistanceTest.java:
(test case, via TestNG)
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.concurrent.ThreadLocalRandom;
public class SubSumLimitedDistanceTest {
private int[] arr;
private int K;
private int L;
private int maxSum;
private int[] arr2;
private int K2;
private int L2;
private int maxSum2;
private int[] arrMax;
private int KMax;
private int KMaxLargest;
private int LMax;
private int LMaxLargest;
#BeforeClass
private void setUp() {
// init - arr,
arr = new int[]{10, 1, 1, 1, 1, 10};
K = 2;
L = 3;
maxSum = 12;
// init - arr2,
arr2 = new int[]{8, 3, 7, 6, 2, 1, 9, 2, 5, 4};
K2 = 3;
L2 = 4;
maxSum2 = 30;
// init - arrMax,
arrMax = new int[SubSumLimitedDistance.MAX_ARR_LEN];
ThreadLocalRandom rd = ThreadLocalRandom.current();
long maxLongEle = Long.MAX_VALUE / SubSumLimitedDistance.MAX_ARR_LEN;
int maxEle = maxLongEle > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) maxLongEle;
for (int i = 0; i < arrMax.length; i++) {
arrMax[i] = rd.nextInt(maxEle);
}
KMax = 5;
LMax = 10;
KMaxLargest = SubSumLimitedDistance.MAX_K;
LMaxLargest = SubSumLimitedDistance.MAX_L;
}
#Test
public void test() {
Assert.assertEquals(SubSumLimitedDistance.find(arr, K, L), maxSum);
Assert.assertEquals(SubSumLimitedDistance.find(arr2, K2, L2), maxSum2);
}
#Test(timeOut = 6000)
public void test_veryLargeArray() {
run_printDuring(arrMax, KMax, LMax);
}
#Test(timeOut = 60000) // takes seconds,
public void test_veryLargeArrayL() {
run_printDuring(arrMax, KMax, LMaxLargest);
}
#Test(timeOut = 60000) // takes seconds,
public void test_veryLargeArrayK() {
run_printDuring(arrMax, KMaxLargest, LMax);
}
// run find once, and print during,
private void run_printDuring(int[] arr, int K, int L) {
long startTime = System.currentTimeMillis();
long sum = SubSumLimitedDistance.find(arr, K, L);
long during = System.currentTimeMillis() - startTime; // during in milliseconds,
System.out.printf("arr length = %5d, K = %3d, L = %4d, max sum = %15d, running time = %.3f seconds\n", arr.length, K, L, sum, during / 1000.0);
}
#Test
public void test_corner_notEnoughEle() {
Assert.assertEquals(SubSumLimitedDistance.find(new int[]{1}, 2, 3), SubSumLimitedDistance.NOT_ENOUGH_ELE); // not enough element,
Assert.assertEquals(SubSumLimitedDistance.find(new int[]{0}, 1, 3), SubSumLimitedDistance.NOT_ENOUGH_ELE); // not enough element,
}
#Test
public void test_corner_ZeroL() {
Assert.assertEquals(SubSumLimitedDistance.find(new int[]{1, 2, 3}, 2, 0), 0); // L = 0,
Assert.assertEquals(SubSumLimitedDistance.find(new int[]{0}, 1, 0), 0); // L = 0,
}
#Test(expectedExceptions = IllegalArgumentException.class)
public void test_invalid_K() {
// SubSumLimitedDistance.find(new int[]{1, 2, 3}, 0, 2); // K = 0,
// SubSumLimitedDistance.find(new int[]{1, 2, 3}, -1, 2); // K = -1,
SubSumLimitedDistance.find(new int[]{1, 2, 3}, SubSumLimitedDistance.MAX_K + 1, 2); // K = SubSumLimitedDistance.MAX_K+1,
}
#Test(expectedExceptions = IllegalArgumentException.class)
public void test_invalid_L() {
// SubSumLimitedDistance.find(new int[]{1, 2, 3}, 2, -1); // L = -1,
SubSumLimitedDistance.find(new int[]{1, 2, 3}, 2, SubSumLimitedDistance.MAX_L + 1); // L = SubSumLimitedDistance.MAX_L+1,
}
#Test(expectedExceptions = IllegalArgumentException.class)
public void test_invalid_tooLong() {
SubSumLimitedDistance.find(new int[SubSumLimitedDistance.MAX_ARR_LEN + 1], 2, 3); // input array too long,
}
}
Output of test case for large input:
arr length = 131072, K = 5, L = 10, max sum = 20779205738, running time = 0.303 seconds
arr length = 131072, K = 64, L = 10, max sum = 21393422854, running time = 1.917 seconds
arr length = 131072, K = 5, L = 256, max sum = 461698553839, running time = 9.474 seconds

Integer list to ranges

I need to convert a list of ints to a string containing all the ranges in the list.
So for example, the output should be as follows:
getIntRangesFromList([1,3,7,2,11,8,9,11,12,15]) -> "1-3,7-9,11-12,15"
So the input is not sorted and there can be duplicate values. The lists range in size from one element to 4k elements. The minimum and maximum values are 1 and 4094.
This is part of a performance critical piece of code. I have been trying to optimize this, but I can't find a way to get this faster. This is my current code:
def _getIntRangesFromList(list):
if (list==[]):
return ''
list.sort()
ranges = [[list[0],list[0]]] # ranges contains the start and end values of each range found
for val in list:
r = ranges[-1]
if val==r[1]+1:
r[1] = val
elif val>r[1]+1:
ranges.append([val,val])
return ",".join(["-".join([str(y) for y in x]) if x[0]!=x[1] else str(x[0]) for x in ranges])
Any idea on how to get this faster?
This could be a task for the itertools module.
import itertools
list_num = [1, 2, 3, 7, 8, 9, 11, 12, 15]
# Group by (value - running counter): consecutive values keep the same
# offset from the counter and therefore land in the same group.
groups = (list(grp) for _, grp in
          itertools.groupby(list_num, lambda v, c=itertools.count(): v - next(c)))
# '(first, last)[:len]' collapses singleton groups to just their first value.
print(', '.join('-'.join(map(str, (run[0], run[-1])[:len(run)])) for run in groups))
This will give you 1-3, 7-9, 11-12, 15.
To understand what's going on you might want to check the content of groups.
import itertools
list_num = [1, 2, 3, 7, 8, 9, 11, 12, 15]
# Key is the difference between each value and a running counter, so
# consecutive values share one key and end up in one group.
groups = (list(x) for _, x in
          itertools.groupby(list_num, lambda x, c=itertools.count(): x - next(c)))
# Inspect the groups the generator produces.
for element in groups:
    print('element={}'.format(element))
This will give you the following output.
element=[1, 2, 3]
element=[7, 8, 9]
element=[11, 12]
element=[15]
The basic idea is to have a counter running parallel to the numbers. groupby will create individual groups for numbers with the same numerical distance to the current value of the counter.
I don't know whether this is faster on your version of Python. You'll have to check this yourself. In my setting it's slower with this data set, but faster with a bigger number of elements.
The fastest one I could come up, which tests about 10% faster than your solution on my machine (according to timeit):
def _ranges(l):
    # Sorts in place; assumes the values are unique (see the note below
    # the code -- duplicates break the wraparound trick).
    if l:
        l.sort()
        # Emit l[i] followed by '-' when the next value continues a run and
        # ',' when it starts a new one.  The filter drops interior members:
        # l[i-1] + 2 == l[i+1] means l[i] sits strictly inside a run.
        # Subtle hack: at i == 0, l[i-1] is l[-1] (the maximum), and
        # max + 2 can never equal l[1], so the first element is always kept.
        return ''.join([(str(l[i]) + ('-' if l[i] + 1 == l[i + 1] else ','))
                        for i in range(0, len(l) - 1) if l[i - 1] + 2 != l[i + 1]] +
                       [str(l[-1])])
    else: return ''
The above code assumes that the values in the list are unique. If they aren't, it's easy to fix but there's a subtle hack which will no longer work and the end result will be slightly slower.
I actually timed _ranges(u[:]) because of the sort; u is 600 randomly selected integers from range(1000) comprising 235 subsequences; 83 are singletons and 152 contain at least two numbers. If the list is sorted, quite a lot of time is saved.
def _to_range(l, start, stop, idx, result):
    """Recursive helper: walk sorted `l` from `idx`, accumulating maximal
    (start, stop) runs into `result`."""
    if idx == len(l):
        # End of input: close the run still open.
        result.append((start, stop))
        return result
    value = l[idx]
    if value - stop > 1:
        # Gap: close the open run and start a fresh one at `value`.
        result.append((start, stop))
        return _to_range(l, value, value, idx + 1, result)
    # Contiguous (or duplicate): extend the open run.
    return _to_range(l, start, value, idx + 1, result)

def get_range(l):
    """Return [(start, stop), ...] for sorted list `l`; [] when empty.
    Singletons come back as (x, x)."""
    if not l:
        return []
    return _to_range(l, start = l[0], stop = l[0], idx = 0, result = [])
l = [1, 2, 3, 7, 8, 9, 11, 12, 15]
result = get_range(l)
print(result)
>>> [(1, 3), (7, 9), (11, 12), (15, 15)]
# I think it's better to fetch the data as it is and if needed, change it
# with
print(','.join('-'.join([str(start), str(stop)]) for start, stop in result))
>>> 1-3,7-9,11-12,15-15
If you don't care about keeping the raw tuples, you can just append str(start) + '-' + str(stop) inside the _to_range function, so there is no need for the extra '-'.join call later.
I'll concentrate on the performance that is your main issue. I'll give 2 solutions:
1) If the boundaries of the integers stored is between A and B, and you can create an array of booleans(even you can choose an array of bits for expanding the range you can storage) with (B - A + 2) elements, e.g. A = 0 and B = 1 000 000, we can do this (i'll write it in C#, sorry XD). This run in O(A - B) and is a good solution if A - B is less than the number of numbers:
// Counting-sort style approach: mark which values occur in a boolean
// array indexed by (value - A), then sweep once to emit the ranges.
// Runs in O(B - A); good when B - A is smaller than the input count.
// NOTE(review): assumes every value lies in [A, B]; out-of-range input
// would index past the array.
public string getIntRangesFromList(int[] numbers)
{
    //You can change this 2 constants
    const int A = 0;
    const int B = 1000000;
    //Create an array with all its values in false by default
    //Last value always will be in false in propourse, as you can see it storage 1 value more than needed for 2nd cycle
    bool[] apparitions = new bool[B - A + 2];
    int minNumber = B + 1;
    int maxNumber = A - 1;
    int pos;
    for (int i = 0; i < numbers.Length; i++)
    {
        pos = numbers[i] - A;
        apparitions[pos] = true;
        if (minNumber > pos)
        {
            minNumber = pos;
        }
        if (maxNumber < pos)
        {
            maxNumber = pos;
        }
    }
    //I will mantain the concatenation simple, but you can make it faster to improve performance
    string result = "";
    bool isInRange = false;
    bool isFirstRange = true;
    int firstPosOfRange = 0; //Irrelevant what is its initial value
    // Sweep one slot past maxNumber: the guaranteed-false sentinel slot
    // closes a range that extends to the maximum.
    for (int i = minNumber; i <= maxNumber + 1; i++)
    {
        if (!isInRange)
        {
            if (apparitions[i])
            {
                // A new range starts here: emit its first value.
                if (!isFirstRange)
                {
                    result += ",";
                }
                else
                {
                    isFirstRange = false;
                }
                result += (i + A);
                isInRange = true;
                firstPosOfRange = i;
            }
        }
        else
        {
            if (!apparitions[i])
            {
                // Range ended at i-1; only emit "-end" for multi-value ranges.
                if (i > firstPosOfRange + 1)
                {
                    result += "-" + (i + A - 1);
                }
                isInRange = false;
            }
        }
    }
    return result;
}
2) O(N * log N)
// Sort-then-sweep approach, O(N log N).
// NOTE(review): duplicate values still produce repeated entries
// (e.g. "11,11"); deduplicate first if the input may contain them.
public string getIntRangesFromList2(int[] numbers)
{
    string result = "";
    if (numbers.Length > 0)
    {
        // BUG FIX: the original called numbers.OrderBy(x => x) and discarded
        // the result -- LINQ's OrderBy returns a new sequence and does NOT
        // sort in place, so the array was never actually sorted.
        System.Array.Sort(numbers); // sorting makes the algorithm O(N * log N)
        result += numbers[0];
        int countNumbersInRange = 1;
        for (int i = 1; i < numbers.Length; i++)
        {
            if (numbers[i] != numbers[i - 1] + 1)
            {
                // Close the previous range (only multi-value ranges get "-end").
                if (countNumbersInRange > 1)
                {
                    result += "-" + numbers[i - 1];
                }
                result += "," + numbers[i];
                countNumbersInRange = 1;
            }
            else
            {
                countNumbersInRange++;
            }
        }
        // BUG FIX: the original never closed the final range, so e.g.
        // [1,2,3] produced "1" instead of "1-3".
        if (countNumbersInRange > 1)
        {
            result += "-" + numbers[numbers.Length - 1];
        }
    }
    return result;
}

Find even multiples of a number in a list using Python

I am searching for the most pythonic way to check, whether one or more elements in a list are even multiples of a predefined number with a predefined tolerance. An example is given below:
myNumber=3.5
myList=[0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5]
myTolerance=0.5
myResult=[0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0]
Any suggestions are very appreciated.
How about
from numpy import mod, floor
# NOTE(review): relies on myNumber, myList and myTolerance defined in the
# question snippet above.
# 1 where `number` is within myTolerance ABOVE a multiple of myNumber whose
# multiplier floor(number / myNumber) is even.
withinTolAbove=[int(mod(number, myNumber) <= myTolerance and
                mod(floor(number / myNumber), 2) == 0) for number in myList]
# 1 where `number` is within myTolerance BELOW such a multiple: shifting the
# value up by the tolerance lets the same remainder test catch it.
withinTolBelow=[int(mod(number + myTolerance, myNumber) <= myTolerance and
                mod(floor((number + myTolerance) / myNumber), 2) == 0) for number in myList]
# Combine both directions; the `number > myTolerance` factor zeroes the
# matches clustered around 0.
myResult=[max(i1, i2) * int(number > myTolerance) for i1, i2, number in zip(withinTolAbove, withinTolBelow, myList)]
The first part determines if the division is within the tolerance of an integer and the second part figures out if this integer is divisible by 2.
How about
print print([int(abs(x - round(x / myNumber) * myNumber) <= myTolerance and round(x / myNumber) > 1.5) for x in myList])
Output:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0]

Round with integer division

Is there is a simple, pythonic way of rounding to the nearest whole number without using floating point? I'd like to do the following but with integer arithmetic:
skip = int(round(1.0 * total / surplus))
==============
#John: Floating point is not reproducible across platforms. If you want your code to pass tests across different platforms then you need to avoid floating point (or add some hacky espilon stuff to your tests and hope it works). The above may be simple enough that it would be the same on most/all platforms, but I'd rather not make that determination as it is easier to avoid floating point altogether. How is that "not in the spirit of Python"?
You can do this quite simply:
(n + d // 2) // d, where n is the dividend and d is the divisor.
Alternatives like (((n << 1) // d) + 1) >> 1 or the equivalent (((n * 2) // d) + 1) // 2 may be SLOWER in recent CPythons, where an int is implemented like the old long.
The simple method does 3 variable accesses, 1 constant load, and 3 integer operations. The complicated methods do 2 variable accesses, 3 constant loads, and 4 integer operations. Integer operations are likely to take time which depends on the sizes of the numbers involved. Variable accesses of function locals don't involve "lookups".
If you are really desperate for speed, do benchmarks. Otherwise, KISS.
skip = (((total << 1) // surplus) + 1) >> 1
Shifting things left by one bit effectively multiplies by two, shifting things right by one bit divides by two rounding down. Adding one in the middle makes it so that "rounding down" is actually rounding up if the result would have been above a .5 decimal part.
It's basically the same as if you wrote...
skip = int((1.0*total/surplus) + 0.5)
except with everything multiplied by 2, and then later divided by 2, which is something you can do with integer arithmetic (since bit shifts don't require floating point).
TL;DR:
q, r = divmod(total, surplus)
skip = q if q % 2 == 0 and 2*r == surplus else q + (2*r // surplus) # rounds to nearest integer; half to nearest even
This solution:
rounds to the nearest integer as demanded by OP
works for both positive and negative integers
rounds 0.5 fractional parts to nearest even integer (rnd(0.5)=0, rnd(1.5)=2) which matches the behavior of python's round function used by OP
sticks to integer arithmetic as demanded by OP (see documentation of divmod)
Full story
Inspired by zhmyh's answer, which is
q, r = divmod(total, surplus)
skip = q + int(bool(r)) # rounds to next greater integer (always ceiling)
, I came up with the following solution (UPDATE: that only works for non-negative total and surplus, as pointed out in the comments):
q, r = divmod(total, surplus)
skip = q + int(2 * r >= surplus) # rounds to nearest integer (floor or ceiling) if total and surplus are non-negative
Since the OP asked for rounding to the nearest whole number, zhmyh's solution is in fact slightly incorrect, because it always rounds to the next greater whole number, while my solution, as demanded, rounds to the nearest — but works for non-negative total and surplus only. A correct solution that also works for negative numbers is:
q, r = divmod(total, surplus)
skip = q + (2 * r // surplus) # rounds to nearest integer (positive: half away from zero, negative: half toward zero)
Please note though that if 2 * r == surplus, it will basically perform ceiling for both positive and negative results, e.g. ceil(-1.5) == -1 whereas ceil(1.5) == 2. This is still correct behavior in terms of rounding to the nearest integer (because of equal distance to next lower and next greater integer) but it is asymmetrical in reference to zero. To also fix this, that is, round half away from zero, we can add a boolean condition:
q, r = divmod(total, surplus)
skip = q if q < 0 and 2*r == surplus else q + (2*r // surplus) # rounds to nearest integer; half away from zero
And even better, to round half to nearest even like python's round function used by the OP:
q, r = divmod(total, surplus)
skip = q if q % 2 == 0 and 2*r == surplus else q + (2*r // surplus) # rounds to nearest integer; half to nearest even
In case you wonder how divmod is defined: According to its documentation
For integers, the result is the same as (a // b, a % b).
We therefore stick with integer arithmetic, as demanded by the OP.
Yet another funny way:
q, r = divmod(total, surplus)
skip = q + int(bool(r))
Simply take care of the rounding rule before you ever divide. For the simplest round-half-up:
if total % surplus < surplus / 2:
return total / surplus
else:
return (total / surplus) + 1
Tweak a little bit if you need to do a proper round-to-even.
The question is the rounding strategy you are after.
Here are several that I came up with - note that integer floor and ceiling are cases, but in "round to nearest" there are many strategies. The IEEE 754 and the most professional and industrial and even elegant way of doing it is Round To Nearest Even. I've never seen a single example anywhere of doing this for integer arithmetic. You cannot go through float as the 53-bit mantissa may cause precision issues and it already applies round-to-nearest-even mode if desiring to do a different rounding strategy. The correct techniques should always stay in the integer domain
def roundNegInf(n, d):
    """Divide n by d, rounding the quotient toward negative infinity (floor)."""
    # Python's // operator already floors for every sign combination.
    return n // d
def roundPosInf(n, d):
    """Divide n by d, rounding the quotient toward positive infinity (ceil)."""
    # Bias the numerator by d-1 (positive d) or d (negative d) so that a
    # single floor division lands on the ceiling for every sign combination.
    bias = d - 1 if d >= 0 else d
    return (n + bias) // d
def roundTowardsZero(n, d):
    """Divide n by d, rounding the quotient toward zero (truncation)."""
    if (n < 0) != (d < 0):
        # Mixed signs: the quotient is negative, so truncation means
        # rounding up — apply the ceiling bias.
        adjust = d - 1 if d >= 0 else d
    else:
        # Same signs: the quotient is non-negative, so truncation is floor.
        adjust = 0
    return (n + adjust) // d
def roundAwayFromZero(n, d):
    """Divide n by d, rounding the quotient away from zero."""
    if (n < 0) != (d < 0):
        # Negative quotient: floor division already moves away from zero.
        adjust = 0
    else:
        # Non-negative quotient: apply the ceiling bias to move away from zero.
        adjust = d - 1 if d >= 0 else d
    return (n + adjust) // d
def roundNearestPosInf(n, d):
    """Round n/d to the nearest integer; exact halves go toward +infinity."""
    quot, rem = divmod(n, d)
    # divmod's remainder carries the sign of d, so the half-way comparison
    # flips direction for a negative divisor.
    if d < 0:
        round_up = 2 * rem <= d
    else:
        round_up = 2 * rem >= d
    return quot + round_up
def roundNearestNegInf(n, d):
    """Round n/d to the nearest integer; exact halves go toward -infinity."""
    quot, rem = divmod(n, d)
    # Strict comparisons leave exact halves with the floored quotient.
    if d < 0:
        round_up = 2 * rem < d
    else:
        round_up = 2 * rem > d
    return quot + round_up
def roundNearestEven(n, d):
    """Round n/d to the nearest integer; exact halves go to the nearest even
    integer (banker's rounding, as in IEEE 754 round-to-nearest-even)."""
    quot, rem = divmod(n, d)
    is_tie = 2 * rem == d
    past_half = 2 * rem < d if d < 0 else 2 * rem > d
    # On an exact .5 tie, bump only odd quotients so the result lands on even.
    return quot + (quot & 1) * is_tie + past_half
def roundNearestToZero(n, d):
    """Round n/d to the nearest integer; exact halves go toward zero."""
    quot, rem = divmod(n, d)
    is_tie = 2 * rem == d
    past_half = 2 * rem < d if d < 0 else 2 * rem > d
    # On a tie, bump only negative quotients: floor already undershot them.
    return quot + (quot < 0) * is_tie + past_half
def roundNearestAwayZero(n, d):
    """Round n/d to the nearest integer; exact halves go away from zero."""
    quot, rem = divmod(n, d)
    is_tie = 2 * rem == d
    past_half = 2 * rem < d if d < 0 else 2 * rem > d
    # On a tie, bump only non-negative quotients; negative ones already
    # moved away from zero via flooring.
    return quot + (quot >= 0) * is_tie + past_half
def testRounding():
    """Exercise every rounding helper against a hand-computed answer table.

    Each row of `expected` corresponds to one function in `funcs`, evaluated
    over every (numerator, denominator) pair in `pairs`.
    """
    pairs = ((1, 2), (-1, 2), (1, -2), (-1, -2), (3, 2), (-3, 2), (3, -2), (-3, -2),
             (1, 3), (-1, 3), (1, -3), (-1, -3), (2, 3), (-2, 3), (2, -3), (-2, -3))
    funcs = (roundNegInf, roundPosInf, roundTowardsZero, roundAwayFromZero,
             roundNearestPosInf, roundNearestNegInf,
             roundNearestToZero, roundNearestAwayZero, roundNearestEven)
    expected = [[0, -1, -1, 0, 1, -2, -2, 1, 0, -1, -1, 0, 0, -1, -1, 0],
                [1, 0, 0, 1, 2, -1, -1, 2, 1, 0, 0, 1, 1, 0, 0, 1],
                [0, 0, 0, 0, 1, -1, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                [1, -1, -1, 1, 2, -2, -2, 2, 1, -1, -1, 1, 1, -1, -1, 1],
                [1, 0, 0, 1, 2, -1, -1, 2, 0, 0, 0, 0, 1, -1, -1, 1],
                [0, -1, -1, 0, 1, -2, -2, 1, 0, 0, 0, 0, 1, -1, -1, 1],
                [0, 0, 0, 0, 1, -1, -1, 1, 0, 0, 0, 0, 1, -1, -1, 1],
                [1, -1, -1, 1, 2, -2, -2, 2, 0, 0, 0, 0, 1, -1, -1, 1],
                [0, 0, 0, 0, 2, -2, -2, 2, 0, 0, 0, 0, 1, -1, -1, 1]
                ]
    computed = [[fn(num, den) for num, den in pairs] for fn in funcs]
    assert(all([got == want for got, want in zip(computed, expected)]))
This should work too:
def rint(n):
    """Round n to the nearest integer, with exact halves rounded away from
    zero (note: this differs from Python's round, which rounds half to even)."""
    if n > 0:
        return int(n + 0.5)
    # int() truncates toward zero, so subtracting 0.5 rounds negatives
    # (and zero, harmlessly) away from zero.
    return int(n - 0.5)

Categories

Resources