Replacing for loop with list comprehension - python

PROBLEM: I was wondering if it is possible to replace this for loop with a list comprehension. Trying to learn idiomatic Python.
STATEMENT:
Take row #0 of X and c
Multiply corresponding elements (you get 5 numbers for 5 columns)
Sum these 5 numbers
Subtract it from y[0]
square it for the final result
Repeat for all rows of X, c and columns of y
Find the index of c with the smallest result in step 5
import numpy as np
X = np.array([[66, 5, 15, 2, 500],
[21, 3, 50, 1, 100],
[120, 15, 5, 2, 1200]])
y = np.array([250000, 60000, 525000])
c = np.array([[3000, 200 , -50, 5000, 100],
[2000, -250, -100, 150, 250],
[3000, -100, -150, 0, 150]])
def find_best(X, y, c):
smallest_error = np.Inf
best_index = -1
idx = 0
sq_error_list = []
for coeff in c:
sq_error = sum((y - (X # c[idx]))**2)
sq_error_list.append(sq_error)
idx += 1
best_index = sq_error_list.index(min(sq_error_list))
print("the best set is set %d" % best_index)
find_best(X, y, c)

You can solve it with this
np.argmin(np.sum((y[:,None]-X # c.T)**2,axis=0))

Related

find index of n consecutive values greater than zero with the largest sum from a numpy array (or pandas Series)

So here is my problem: I have an array like this:
arr = array([0, 0, 1, 8, 10, 20, 26, 32, 37, 52, 0, 0, 46, 42, 30, 19, 8, 2, 0, 0, 0])
In this array I want to find n consecutive values, greater than zero with the biggest sum. In this example with n = 5 this would be array([20, 26, 32, 37, 52]) and the index would be 5.
What I tried is of course a loop:
n = 5
max_sum = 0
max_loc = 0
for i in range(arr.size - n):
if all(arr[i:i + n] > 0) and arr[i:i + n].sum() > max_sum:
max_sum = arr[i:i + n].sum()
max_loc = i
print(max_loc)
This is fine for not too many short arrays but of course I need to use this on many not so short arrays.
I was experimenting with numpy so I would only have to iterate non-zero value groups:
diffs = np.concatenate((np.array([False]), np.diff(arr > 0)))
groups = np.split(arr, np.where(diffs)[0])
for group in groups:
if group.sum() > 0 and group.size >= n:
...
but I believe this is nice but not the right direction. I am looking for a simpler and faster numpy / pandas solution that really uses the powers of these packages.
Using cross-correlation, numpy.correlate, is a possible, concise and fast solution:
n=5
arr[arr<0] = np.iinfo(arr.dtype).min # The greatest negative integer possible
#Thanks for the np.iinfo suggestion, #Corralien
idx = np.argmax(np.correlate(arr, np.ones(n), 'valid'))
idx, arr[idx:(idx+5)]
Another possible solution:
n, l = 5, arr.size
arr[arr<0] = np.iinfo(arr.dtype).min # The greatest negative integer possible
#Thanks for the np.iinfo suggestion, #Corralien
idx = np.argmax([np.sum(np.roll(arr,-x)[:n]) for x in range(l-n+1)])
idx, arr[idx:(idx+n)]
Output:
(5, array([20, 26, 32, 37, 52]))
You can use sliding_window_view:
from numpy.lib.stride_tricks import sliding_window_view
N = 5
win = sliding_window_view(arr, N)
idx = ((win.sum(axis=1)) * ((win>0).all(axis=1))).argmax()
print(idx, arr[idx:idx+N])
# Output
5 [20 26 32 37 52]
Answer greatly enhanced by chrslg to save memory and keep a win as a view.
Update
A nice bonus is this should work with Pandas Series just fine.
N = 5
idx = pd.Series(arr).where(lambda x: x > 0).rolling(N).sum().shift(-N+1).idxmax()
print(idx, arr[idx:idx+N])
# Output
5 [20 26 32 37 52]

Reading a text file (tab/space delimited) having named columns into lists with the lists having the same name as the column name

My text file looks like this:
x y z D
0 0 350 10
50 -50 400 15
100 50 450 10
-25 100 500 10
where the columns are tab-separated. I want to import it into 4 Python lists having the name of the columns:
x = [0, 50, 100, -25]
y = [0, -50, 50, 100]
z = [350, 400, 450, 500]
D = [10, 15, 10, 10]
Is it possible to do such using some in-built functions without resorting to importing Pandas or some special packages?
you could do this:
import re
with open('file.txt') as f:
data = [re.split('[ ]+|\t',x) for x in f.read().split('\n')]
res = dict((x,[]) for x in data[0])
for i in data[1:]:
for j in range(len(i)):
res[data[0][j]].append(i[j])
print(res)
I suggest this approach...
Construct a dictionary keyed on the column names (x, y, z, D)
Each key has a value which is a list.
Consume the file appending the individual values to the appropriate keys.
from collections import defaultdict
with open('t.txt') as infile:
cols = next(infile).strip().split()
d = defaultdict(list)
for line in infile:
for i, t in enumerate(line.strip().split()):
d[cols[i]].append(int(t))
for k, v in d.items():
print(f'{k} = {v}')
Output:
x = [0, 50, 100, -25]
y = [0, -50, 50, 100]
z = [350, 400, 450, 500]
D = [10, 15, 10, 10]

Count the number of times values appear within a range of values

How do I output a list which counts and displays the number of times different values fit into a range?
Based on the below example, the output would be x = [0, 3, 2, 1, 0] as there are 3 Pro scores (11, 24, 44), 2 Champion scores (101, 888), and 1 King score (1234).
- P1 = 11
- P2 = 24
- P3 = 44
- P4 = 101
- P5 = 1234
- P6 = 888
totalsales = [11, 24, 44, 101, 1234, 888]
Here is ranking corresponding to the sales :
Sales___________________Ranking
0-10____________________Noob
11-100__________________Pro
101-1000________________Champion
1001-10000______________King
100001 - 200000__________Lord
This is one way, assuming your values are integers and ranges do not overlap.
from collections import Counter
# Ranges go to end + 1
score_ranges = [
range(0, 11), # Noob
range(11, 101), # Pro
range(101, 1001), # Champion
range(1001, 10001), # King
range(10001, 200001) # Lord
]
total_sales = [11, 24, 44, 101, 1234, 888]
# This counter counts how many values fall into each score range (by index).
# It works by taking the index of the first range containing each value (or -1 if none found).
c = Counter(next((i for i, r in enumerate(score_ranges) if s in r), -1) for s in total_sales)
# This converts the above counter into a list, taking the count for each index.
result = [c[i] for i in range(len(score_ranges))]
print(result)
# [0, 3, 2, 1, 0]
As a general rule homework should not be posted on stackoverflow. As such, just a pointer on how to solve this, implementation is up to you.
Iterate over the totalsales list and check if each number is in range(start,stop). Then for each matching check increment one per category in your result list (however using a dict to store the result might be more apt).
Here a possible solution with no use of modules such as numpy or collections:
totalsales = [11, 24, 44, 101, 1234, 888]
bins = [10, 100, 1000, 10000, 20000]
output = [0]*len(bins)
for s in totalsales:
slot = next(i for i, x in enumerate(bins) if s <= x)
output[slot] += 1
output
>>> [0, 3, 2, 1, 0]
If your sales-to-ranking mapping always follows a logarithmic curve, the desired output can be calculated in linear time using math.log10 with collections.Counter. Use an offset of 0.5 and the abs function to handle sales of 0 and 1:
from collections import Counter
from math import log10
counts = Counter(int(abs(log10(abs(s - .5)))) for s in totalsales)
[counts.get(i, 0) for i in range(5)]
This returns:
[0, 3, 2, 1, 0]
Here, I have used the power of dataframe to store the values, then using bin and cut to group the values into the right categories. The extracting the value count into list.
Let me know if it is okay.
import pandas as pd
import numpy
df = pd.DataFrame([11, 24, 44, 101, 1234, 888], columns=['P'])# Create dataframe
bins = [0, 10, 100, 1000, 10000, 200000]
labels = ['Noob','Pro', 'Champion', 'King', 'Lord']
df['range'] = pd.cut(df.P, bins, labels = labels)
df
outputs:
P range
0 11 Pro
1 24 Pro
2 44 Pro
3 101 Champion
4 1234 King
5 888 Champion
Finally, to get the value count. Use:
my = df['range'].value_counts().sort_index()#this counts to the number of occurences
output=map(int,my.tolist())#We want the output to be integers
output
The result below:
[0, 3, 2, 1, 0]
You can use collections.Counter and a dict:
from collections import Counter
totalsales = [11, 24, 44, 101, 1234, 888]
ranking = {
0: 'noob',
10: 'pro',
100: 'champion',
1000: 'king',
10000: 'lord'
}
c = Counter()
for sale in totalsales:
for k in sorted(ranking.keys(), reverse=True):
if sale > k:
c[ranking[k]] += 1
break
Or as a two-liner (credits to #jdehesa for the idea):
thresholds = sorted(ranking.keys(), reverse=True)
c = Counter(next((ranking[t] for t in thresholds if s > t)) for s in totalsales)

From 4 given arrays(not sorted), find the elements from each array whose sum is equal to some number X

Suppose there are 4 unsorted arrays as given below:
A = [0, 100, -100, 50, 200]
B = [30, 100, 20, 0]
C = [0, 20, -1, 80]
D = [50, 0, -200, 1]
Suppose X is 0, so the few of the possible O/P should be (pick 1 element from each array which satisfy condition):
0,0,0,0
-100, 100, 0, 0
-100, 30, 20,50 .. etc.
I was able to devise the algorithm which can do this in O(n^3LogN), is there any better way to achieve the same?
My Solution:
1- Sort each array.
2- Fixed the element from array A.
3- run three loops for the rest of the arrays and take the sum of each element:
if sum > 0 (return -1, no such elements exit)
if sum == 0 (return current elements)
if sum < 0 (then advance the pointer from the array for which the current element is minimum.)
Any suggestion over this?
a kind of dynamic programming approach.
initialize sums (a dict of the form {possible_sum0: [way_to_get_sum0, ...]}) with the first list A. this results in
sums = {0: [[0]], 100: [[100]], -100: [[-100]], 50: [[50]], 200: [[200]]}
the update that dictionary with the lists B and C. sums will now contain entries like
sums = {...,
30: [[0, 30, 0]],
50: [[0, 30, 20], [50, 0, 0]],
29: [[0, 30, -1]], ...}
then in find_sum i sort the last list D and the sums for some speedup and break if a give sum X is no longer accessible.
here is the code:
from collections import defaultdict
A = [0, 100, -100, 50, 200]
B = [30, 100, 20, 0]
C = [0, 20, -1, 80]
D = [50, 0, -200, 1]
def initialize_sums(lst):
return {item: [[item]] for item in lst}
def update_sums(sums, lst):
new_sums = defaultdict(list)
for sm, ways in sums.items():
for item in lst:
new_sum = sm + item
for way in ways:
new_sums[new_sum].append(way + [item])
return new_sums
def find_sum(sums, last_lst, X):
last_lst = sorted(last_lst)
ret = []
for sm, ways in sorted(sums.items()):
for item in last_lst:
x = sm + item
if x > X:
break
if x == X:
for way in ways:
ret.append(way + [item])
break
return ret
sums = initialize_sums(lst=A)
sums = update_sums(sums, lst=B)
sums = update_sums(sums, lst=C)
ret = find_sum(sums, last_lst=D, X=0)
print(ret)
# [[-100, 30, 20, 50], [0, 0, -1, 1], [-100, 100, -1, 1], ...]
...did not analyze the overall complexity though.
We can have O(n^2) by hashing pair sums for A and B and checking if for any one of them, sum_AB[i] there might be an X - sum_AB[i] hashed in the pair sums of C and D.
In some circumstances it could be more efficient to enumerate those sums by multiplying each pair of lists as counts of coefficients in polynomials, using a FFT for O(m log m) complexity, where m is the range.
Assuming your arrays all have the same length n (+/- some constant value) you can get O(n^3) by using a set for the fourth array:
from itertools import product
ds = set(D)
for a, b, c in product(A, B, C):
d = X - a - b - c
if d in ds:
print(a, b, c, d)
If one or multiple arrays contain (many) extreme values you can also take shortcuts by checking the running sum against the min and max of subsequent arrays to see if X can still be reached. For example:
ds = set(D)
c_min, c_max = min(C), max(C)
d_min, d_max = min(ds), max(ds)
for a in A:
for b in B:
s = a + b
if s + c_min + d_min > X or s + c_max + d_max < X:
continue # Shortcut here.
for c in C:
d = X - a - b - c
if d in ds:
print(a, b, c, d)
You can further extend this by storing solutions that have already been found for a running sum (of the first two arrays for example) and hence taking a shortcut whenever such a sum is encountered again (by reordering with the min/max check one can avoid repeated computation of s + min/max values):
ds = set(D)
c_min, c_max = min(C), max(C)
d_min, d_max = min(ds), max(ds)
shortcuts = {}
for a in A:
for b in B:
s = a + b
if s in shortcuts:
for c, d in shortcuts[s]:
print(a, b, c, d)
continue
shortcuts[s] = []
if s + c_min + d_min > X or s + c_max + d_max < X:
continue
for c in C:
d = X - a - b - c
if d in ds:
print(a, b, c, d)
shortcuts[s].append((c, d))
A = [0, 100, -100, 50, 200]
B = [30, 100, 20, 0]
C = [0, 20, -1, 80]
D = [50, 0, -200, 1]
solutions = [(x1,x2,x3,x4) for x1 in A for x2 in B for x3 in C for x4 in D if sum([x1,x2,x3,x4]) == 0]
print(solutions)
Output:
>>>[(0, 0, 0, 0), (0, 0, -1, 1), (100, 100, 0, -200), (100, 20, 80, -200), (-100, 30, 20, 50), (-100, 100, 0, 0), (-100, 100, -1, 1), (-100, 20, 80, 0), (200, 0, 0, -200)]
This does exactly what you listed in your steps and works for any size, I don't know if it can get any easier finding all solutions for different list sizes.
find all combinations for an array
def dOfSums(li):
return {sum(x):x for x in sum([list(itertools.combinations(li, i)) for i in range(2,len(li))],[])}
find sums for a number in an array
def findSums(li, num):
return [((namestr(l), dOfSums(l)[num]) for l in li if num in dOfSums(l).keys() ]
name the array
def namestr(obj):
return [name for name in globals() if globals()[name] is obj].pop()
test
for el in findSums([A,B,C,D],50):
print(el)
('A', (0, 100, -100, 50))
('B', (30, 20, 0))
('D', (50, 0))
for el in findSums([A,B,C,D],100):
print(el)
('A', (0, -100, 200))
('B', (100, 0))
('C', (0, 20, 80))
for el in findSums([A,B,C,D],0):
print(el)
('A', (0, 100, -100))

How to select subset of 3D numpy array ending in a value?

I have a 3D numpy array looks like this
shape(3,1000,100)
[[[2,3,0,2,6,...,0,-1,-1,-1,-1,-1],
[1,4,6,1,4,5,3,...,1,2,6,-1,-1],
[7,4,6,3,1,0,1,...,2,0,8,-1,-1],
...
[8,7,6,4,...,2,4,5,2,1,-1]],
...,
[1,5,6,7,...,0,0,0,0,1]]]
Each lane of array end with 0 or multiple(less than 70 I'm sure) -1.
For now, I want to select only 30 values before the -1 for each lane, to make a subset of original numpy array with shape of (3,1000,30)
Should be similar like this,
[[[...,0],
[...,1,2,6],
[...,2,0,8],
...
[...,2,4,5,2,1]],
...,
[...,0,0,0,0,1]]]
Is it possible to do it with some numpy functions? Hope without a for loop:)
Here's one making use of broadcasting and advanced-indexing -
def preceedingN(a, N):
# mask of value (minus 1 here) to be found
mask = a==-1
# Get the first index with the value along the last axis.
# In case its not found, choose the last index
idx = np.where(mask.any(-1), mask.argmax(-1), mask.shape[-1])
# Get N ranged indices along the last axis
ind = idx[...,None] + np.arange(-N,0)
# Finally advanced-index and get the ranged indexed elements as the o/p
m,n,r = a.shape
return a[np.arange(m)[:,None,None], np.arange(n)[:,None], ind]
Sample run -
Setup for reproducible input :
import numpy as np
# Setup sample input array
np.random.seed(0)
m,n,r = 2,4,10
a = np.random.randint(11,99,(m,n,r))
# Select N elements off each row
N = 3
idx = np.random.randint(N,a.shape[-1]-1,(m,n))
a[idx[...,None] < np.arange(a.shape[-1])] = -1
a[0,0] = range(r) # set first row of first 2D slice to range (no -1s there)
Input, output :
>>> a
array([[[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
[81, 23, 69, 76, 50, 98, 57, -1, -1, -1],
[88, 83, 20, 31, 91, 80, 90, 58, -1, -1],
[60, 40, 30, 30, 25, 50, 43, 76, -1, -1]],
[[43, 42, 85, 34, 46, 86, 66, 39, -1, -1],
[11, 47, 64, 16, -1, -1, -1, -1, -1, -1],
[42, 12, 76, 52, 68, 46, 22, 57, -1, -1],
[25, 64, 23, 53, 95, 86, 79, -1, -1, -1]]])
>>> preceedingN(a, N=3)
array([[[ 7, 8, 9],
[50, 98, 57],
[80, 90, 58],
[50, 43, 76]],
[[86, 66, 39],
[47, 64, 16],
[46, 22, 57],
[95, 86, 79]]])
This is a solution based on the idea of calculating the indices which should be kept. We use numpy.argmin and numpy.nonzero to find the start of the -1 or the end of the row, then use 2-dimensional addition/subtraction to build the indices of the 30 elements that need to be kept.
First off, we create reproducible example data
import numpy as np
np.random.seed(0)
a = np.random.randint(low=0, high=10, size=(3, 1000, 100))
for i in range(3):
for j in range(1000):
a[i, j, np.random.randint(low=30, high=a.shape[2]+1):] = -1
You can of course skip this step, I just added it to allow others to reproduce this solution. :)
Now let's go through the code step-by-step:
Change shape of a to simplify code
a.shape = 3 * 1000, 100
Find index of -1 in each row of a
i = np.argmin(a, axis=1)
Replace any indices for rows of a with no -1 by a 100
i[np.nonzero(a[np.arange(a.shape[0]), i] != -1)] = a.shape[1]
Translate indices to 1D
i = i + a.shape[1] * np.arange(a.shape[0])
Calculate indices for all elements that should be kept. We make the indices two-dimensional so that we get the 30 indices before each -1 index.
i = i.reshape(a.shape[0], 1) - 30 + np.arange(30).reshape(1, 30)
a.shape = 3 * 1000 * 100
Perform filtering
a = a[i]
Return a to desired shape
a.shape = 3, 1000, 30
Here's one using stride_tricks for efficient retrieval of slices:
import numpy as np
from numpy.lib.stride_tricks import as_strided
# create mock data
data = np.random.randint(0, 9, (3, 1000, 100))
fstmone = np.random.randint(30, 101, (3, 1000))
data[np.arange(100) >= fstmone[..., None]] = -1
# count -1s (ok, this is a bit wasteful compared to #Divakar's)
aux = np.where(data[..., 30:] == -1, 1, -100)
nmones = np.maximum(np.max(np.cumsum(aux[..., ::-1], axis=-1), axis=-1), 0)
# slice (but this I'd expect to be faster)
sliceable = as_strided(data, data.shape[:2] + (71, 30),
data.strides + data.strides[2:])
result = sliceable[np.arange(3)[:, None], np.arange(1000)[None, :], 70-nmones, :]
Benchmarks, best solution is a hybrid of #Divakar's and mine, #Florian Rhiem's is also quite fast:
import numpy as np
from numpy.lib.stride_tricks import as_strided
# create mock data
data = np.random.randint(0, 9, (3, 1000, 100))
fstmone = np.random.randint(30, 101, (3, 1000))
data[np.arange(100) >= fstmone[..., None]] = -1
def pp(data, N):
# count -1s
aux = np.where(data[..., N:] == -1, 1, -data.shape[-1])
nmones = np.maximum(np.max(np.cumsum(aux[..., ::-1], axis=-1), axis=-1), 0)
# slice
sliceable = as_strided(data, data.shape[:2] + (data.shape[-1]-N+1, N),
data.strides + data.strides[2:])
return sliceable[np.arange(data.shape[0])[:, None],
np.arange(data.shape[1])[None, :],
data.shape[-1]-N-nmones, :]
def Divakar(data, N):
# mask of value (minus 1 here) to be found
mask = data==-1
# Get the first index with the value along the last axis.
# In case its not found, choose the last index
idx = np.where(mask.any(-1), mask.argmax(-1), mask.shape[-1])
# Get N ranged indices along the last axis
ind = idx[...,None] + np.arange(-N,0)
# Finally advanced-index and get the ranged indexed elements as the o/p
m,n,r = data.shape
return data[np.arange(m)[:,None,None], np.arange(n)[:,None], ind]
def combined(data, N):
# mix best of Divakar's and mine
mask = data==-1
idx = np.where(mask.any(-1), mask.argmax(-1), mask.shape[-1])
sliceable = as_strided(data, data.shape[:2] + (data.shape[-1]-N+1, N),
data.strides + data.strides[2:])
return sliceable[np.arange(data.shape[0])[:, None],
np.arange(data.shape[1])[None, :],
idx-N, :]
def fr(data, N):
data = data.reshape(-1, data.shape[-1])
i = np.argmin(data, axis=1)
i[np.nonzero(data[np.arange(data.shape[0]), i] != -1)] = data.shape[1]
i = i + data.shape[1] * np.arange(data.shape[0])
i = i.reshape(data.shape[0], 1) - N + np.arange(N).reshape(1, N)
data.shape = -1,
res = data[i]
res.shape = 3, 1000, 30
return res
print(np.all(combined(data, 30) == Divakar(data, 30)))
print(np.all(combined(data, 30) == pp(data, 30)))
print(np.all(combined(data, 30) == fr(data, 30)))
from timeit import timeit
for func in pp, Divakar, combined, fr:
print(timeit('f(data, 30)', number=100, globals={'f':func, 'data':data}))
Sample output:
True
True
True
0.2767702739802189
0.13680238201050088
0.060565065999981016
0.0795100320247002

Categories

Resources