Hotelling's T^2 scores in Python

I applied PCA to a data set using matplotlib in Python. However, unlike Matlab, matplotlib does not provide T-squared scores. Is there a way to compute Hotelling's T^2 scores like Matlab does?
Thanks.

matplotlib's PCA class doesn't include the Hotelling T^2 calculation, but it can be done with just a couple of lines of code. The following code includes a function to compute the T^2 values for each point. The __main__ script applies PCA to the same example used in Matlab's pca documentation, so you can verify that the function generates the same values as Matlab.
from __future__ import print_function, division

import numpy as np
from matplotlib.mlab import PCA


def hotelling_tsquared(pc):
    """`pc` should be the object returned by matplotlib.mlab.PCA()."""
    x = pc.a.T
    cov = pc.Wt.T.dot(np.diag(pc.s)).dot(pc.Wt) / (x.shape[1] - 1)
    w = np.linalg.solve(cov, x)
    t2 = (x * w).sum(axis=0)
    return t2
if __name__ == "__main__":
    hald_text = """Y X1 X2 X3 X4
78.5 7 26 6 60
74.3 1 29 15 52
104.3 11 56 8 20
87.6 11 31 8 47
95.9 7 52 6 33
109.2 11 55 9 22
102.7 3 71 17 6
72.5 1 31 22 44
93.1 2 54 18 22
115.9 21 47 4 26
83.8 1 40 23 34
113.3 11 66 9 12
109.4 10 68 8 12
"""
    hald = np.loadtxt(hald_text.splitlines(), skiprows=1)
    ingredients = hald[:, 1:]

    pc = PCA(ingredients, standardize=False)
    coeff = pc.Wt

    np.set_printoptions(precision=4)

    # For coeff and latent, compare to
    # http://www.mathworks.com/help/stats/pca.html#btjpztu-1
    print("coeff:")
    print(coeff)
    print()
    latent = pc.s / (ingredients.shape[0] - 1)
    print("latent:" + (" %9.4f"*len(latent)) % tuple(latent))
    print()

    # For tsquared, compare to
    # http://www.mathworks.com/help/stats/pca.html#bti6r0c-1
    tsquared = hotelling_tsquared(pc)
    print("tsquared:")
    print(tsquared)
Output:
coeff:
[[ 0.0678 0.6785 -0.029 -0.7309]
[ 0.646 0.02 -0.7553 0.1085]
[-0.5673 0.544 -0.4036 0.4684]
[ 0.5062 0.4933 0.5156 0.4844]]
latent: 517.7969 67.4964 12.4054 0.2372
tsquared:
[ 5.6803 3.0758 6.0002 2.6198 3.3681 0.5668 3.4818 3.9794 2.6086
7.4818 4.183 2.2327 2.7216]
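Note: matplotlib.mlab.PCA was deprecated and later removed from matplotlib, so the above will not import on recent versions. As a minimal numpy-only sketch (hypothetical helper name, assuming the covariance matrix is full rank), the same per-observation T^2 values can be computed directly from the data matrix:
import numpy as np

def hotelling_tsquared_np(X):
    """T^2 for each row of X (observations x variables)."""
    Xc = X - X.mean(axis=0)                 # center the observations
    cov = Xc.T.dot(Xc) / (X.shape[0] - 1)   # sample covariance matrix
    w = np.linalg.solve(cov, Xc.T)          # cov^{-1} x for every observation
    return (Xc.T * w).sum(axis=0)           # x' cov^{-1} x per observation
Calling hotelling_tsquared_np(ingredients) on the Hald data should reproduce the tsquared values above.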

Even though this is an old question, I am posting this code as it may help someone.
Here is the code; as a bonus, it performs multiple Hotelling tests at once:
import numpy as np
from scipy.stats import f as f_distrib


def hotelling_t2(X, Y):
    # X and Y are 3D arrays:
    # dim 0: number of features
    # dim 1: number of subjects
    # dim 2: number of mesh nodes or voxels (number of tests)
    nx = X.shape[1]
    ny = Y.shape[1]
    p = X.shape[0]
    Xbar = X.mean(1)
    Ybar = Y.mean(1)
    Xbar = Xbar.reshape(Xbar.shape[0], 1, Xbar.shape[1])
    Ybar = Ybar.reshape(Ybar.shape[0], 1, Ybar.shape[1])

    X_Xbar = X - Xbar
    Y_Ybar = Y - Ybar
    # Pooled within-group scatter: one (p x p) matrix per test.
    Wx = np.einsum('ijk,ljk->ilk', X_Xbar, X_Xbar)
    Wy = np.einsum('ijk,ljk->ilk', Y_Ybar, Y_Ybar)
    W = (Wx + Wy) / float(nx + ny - 2)

    Xbar_minus_Ybar = Xbar - Ybar
    x = np.linalg.solve(W.transpose(2, 0, 1),
                        Xbar_minus_Ybar.transpose(2, 0, 1))
    x = x.transpose(1, 2, 0)

    t2 = np.sum(Xbar_minus_Ybar * x, 0)
    t2 = t2 * float(nx * ny) / float(nx + ny)
    # Convert T^2 to an F statistic and a p-value.
    stat = (t2 * float(nx + ny - 1 - p) / (float(nx + ny - 2) * p))
    pval = 1 - np.squeeze(f_distrib.cdf(stat, p, nx + ny - 1 - p))
    return pval, t2
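A quick usage sketch (hypothetical shapes; random data only to show the calling convention):
rng = np.random.default_rng(0)
X = rng.standard_normal((4, 20, 1000))  # 4 features, 20 subjects, 1000 tests
Y = rng.standard_normal((4, 25, 1000))  # second group, 25 subjects
pval, t2 = hotelling_t2(X, Y)
print(pval.shape)  # one p-value per mesh node/voxel: (1000,)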


Why numpy vectorization is slower than a for loop

The code below has two functions that do the same thing: check whether the line segment between two points intersects a circle.
from line_profiler import LineProfiler
from math import sqrt
import numpy as np


class Point:
    x: float
    y: float

    def __init__(self, x: float, y: float):
        self.x = x
        self.y = y

    def __repr__(self):
        return f"Point(x={self.x}, y={self.y})"


class Circle:
    ctr: Point
    r: float

    def __init__(self, ctr: Point, r: float):
        self.ctr = ctr
        self.r = r

    def __repr__(self):
        return f"Circle(r={self.r}, ctr={self.ctr})"


def loop(p1: Point, p2: Point, circles: list[Circle]):
    m = (p1.y - p2.y) / (p1.x - p2.x)
    n = p1.y - m * p1.x
    max_x = max(p1.x, p2.x)
    min_x = min(p1.x, p2.x)
    for circle in circles:
        if sqrt((circle.ctr.x - p1.x) ** 2 + (circle.ctr.y - p1.y) ** 2) < circle.r \
                or sqrt((circle.ctr.x - p2.x) ** 2 + (circle.ctr.y - p2.y) ** 2) < circle.r:
            return False
        a = m ** 2 + 1
        b = 2 * (m * n - m * circle.ctr.y - circle.ctr.x)
        c = circle.ctr.x ** 2 + circle.ctr.y ** 2 + n ** 2 - circle.r ** 2 - 2 * n * circle.ctr.y
        # compute the intersection points
        discriminant = b ** 2 - 4 * a * c
        if discriminant <= 0:
            # no real roots, the line does not intersect the circle
            continue
        # two real roots, the line intersects the circle at two points
        x1 = (-b + sqrt(discriminant)) / (2 * a)
        x2 = (-b - sqrt(discriminant)) / (2 * a)
        # check if both points in range
        first = min_x <= x1 <= max_x
        second = min_x <= x2 <= max_x
        if first and second:
            return False
    return True


def vectorized(p1: Point, p2: Point, circles):
    m = (p1.y - p2.y) / (p1.x - p2.x)
    n = p1.y - m * p1.x
    max_x = max(p1.x, p2.x)
    min_x = min(p1.x, p2.x)
    circle_ctr_x = circles['x']
    circle_ctr_y = circles['y']
    circle_radius = circles['r']
    # Pt 1 inside circle
    if np.any(np.sqrt((circle_ctr_x - p1.x) ** 2 + (circle_ctr_y - p1.y) ** 2) < circle_radius):
        return False
    # Pt 2 inside circle
    if np.any(np.sqrt((circle_ctr_x - p2.x) ** 2 + (circle_ctr_y - p2.y) ** 2) < circle_radius):
        return False
    # Line intersects with circle in range
    a = m ** 2 + 1
    b = 2 * (m * n - m * circle_ctr_y - circle_ctr_x)
    c = circle_ctr_x ** 2 + circle_ctr_y ** 2 + n ** 2 - circle_radius ** 2 - 2 * n * circle_ctr_y
    # compute the intersection points
    discriminant = b**2 - 4*a*c
    discriminant_bigger_than_zero = discriminant > 0
    discriminant = discriminant[discriminant_bigger_than_zero]
    if discriminant.size == 0:
        return True
    b = b[discriminant_bigger_than_zero]
    # two real roots, the line intersects the circle at two points
    x1 = (-b + np.sqrt(discriminant)) / (2 * a)
    x2 = (-b - np.sqrt(discriminant)) / (2 * a)
    # check if both points in range
    in_range = (min_x <= x1) & (x1 <= max_x) & (min_x <= x2) & (x2 <= max_x)
    return not np.any(in_range)
a = Point(x=-2.47496075130008, y=1.3609840363748935)
b = Point(x=3.4637947060471084, y=-3.7779123453298817)
c = [Circle(r=1.2587063082677084, ctr=Point(x=3.618533781361757, y=2.179925931180058)), Circle(r=0.7625751871124099, ctr=Point(x=-0.3173290200183132, y=4.256206636932641)), Circle(r=0.4926043225930364, ctr=Point(x=-4.626312261120341, y=-1.5754603504419196)), Circle(r=0.6026364956540792, ctr=Point(x=3.775240278691819, y=1.7381168262343072)), Circle(r=1.2804597877349562, ctr=Point(x=4.403273380178893, y=-1.6890127555343681)), Circle(r=1.1562415624767421, ctr=Point(x=-1.0675000352105801, y=-0.23952113329203994)), Circle(r=1.112718432321835, ctr=Point(x=2.500137075066017, y=-2.77748519509295)), Circle(r=0.979889574640609, ctr=Point(x=4.494971251199753, y=-1.0530995423779388)), Circle(r=0.7817624050358268, ctr=Point(x=3.2419454348696544, y=4.3303373486692465)), Circle(r=1.0271176198616367, ctr=Point(x=-0.9740272820753071, y=-4.282195116754338)), Circle(r=1.1585218836700681, ctr=Point(x=-0.42096876790888915, y=2.135161027254492)), Circle(r=1.0242603387003988, ctr=Point(x=2.2617850544260767, y=-4.59942951839469)), Circle(r=1.5704233297828027, ctr=Point(x=-1.1182365440831088, y=4.2411408333943506)), Circle(r=0.37137272043983655, ctr=Point(x=3.280499587987774, y=-4.87871834733383)), Circle(r=1.1829610109115543, ctr=Point(x=-0.27755604766113606, y=-3.68429580935016)), Circle(r=1.0993567600839198, ctr=Point(x=0.23602306761027925, y=0.47530122196024704)), Circle(r=1.3865045367147553, ctr=Point(x=-2.537565761732492, y=4.719766182202855)), Circle(r=0.9492796511909753, ctr=Point(x=-3.7047245796551973, y=-2.501817905967274)), Circle(r=0.9866916911482386, ctr=Point(x=1.3021813533479742, y=4.754952371169189)), Circle(r=0.9053004331885084, ctr=Point(x=-3.4912157984801784, y=-0.5269727600532836)), Circle(r=1.3058987272565075, ctr=Point(x=-1.6983878085276427, y=-2.2910189455221053)), Circle(r=0.5342716756987732, ctr=Point(x=4.948676886704507, y=-1.2467089784975183)), Circle(r=1.0603926633240575, ctr=Point(x=-4.390462974765324, y=0.785568745976325)), Circle(r=0.3448422804513971, ctr=Point(x=-1.6459756952994697, y=2.7608629057950362)), Circle(r=0.8521457455807724, ctr=Point(x=-4.503217369041699, y=3.93796926957188)), Circle(r=0.602438849989669, ctr=Point(x=-2.0703406576157493, y=0.6142570312870999)), Circle(r=0.6453692950682722, ctr=Point(x=-0.14802220452893144, y=4.08189682338989)), Circle(r=0.6983361689325062, ctr=Point(x=0.09362196694661651, y=-1.0953438275586391)), Circle(r=1.880331563921456, ctr=Point(x=0.23481661751521776, y=-4.09217120864087)), Circle(r=0.5766225363413416, ctr=Point(x=3.149434524126505, y=-4.639582956406762)), Circle(r=0.6177559628867022, ctr=Point(x=-1.6758918144661683, y=-0.7954935787503492)), Circle(r=0.7347952666955615, ctr=Point(x=-3.1907522890427575, y=0.7048509241855683)), Circle(r=1.2795003337464894, ctr=Point(x=-1.777244415863577, y=2.936422879898364)), Circle(r=0.9181024765780231, ctr=Point(x=4.212544425778317, y=-1.953546993038261)), Circle(r=1.7681384709020282, ctr=Point(x=-1.3702722387909405, y=-1.7013020424154368)), Circle(r=0.5420789771729688, ctr=Point(x=4.063803796292818, y=-3.7159871611415065)), Circle(r=1.3863651881788939, ctr=Point(x=0.7685002210812408, y=-3.994230705171357)), Circle(r=0.5739750223225826, ctr=Point(x=0.08779554290638258, y=4.879912451441914)), Circle(r=1.2019825386919343, ctr=Point(x=-4.206623233886995, y=-1.1617382464768689))]
circle_dt = np.dtype('float,float,float')
circle_dt.names = ['x', 'y', 'r']
np_c = np.array([(x.ctr.x, x.ctr.y, x.r) for x in c], dtype=circle_dt)
lp1 = LineProfiler()
loop_wrapper = lp1(loop)
loop_wrapper(a, b, c)
lp1.print_stats()
lp2 = LineProfiler()
vectorized_wrapper = lp2(vectorized)
vectorized_wrapper(a, b, np_c)
lp2.print_stats()
One implementation is a regular for-loop; the other is vectorized with numpy.
From my limited knowledge of vectorization, I would have guessed that the vectorized function would perform better, but as you can see below, that is not the case:
Total time: 4.36e-05 s
Function: loop at line 31
Line # Hits Time Per Hit % Time Line Contents
==============================================================
31 def loop(p1: Point, p2: Point, circles: list[Circle]):
32 1 9.0 9.0 2.1 m = (p1.y - p2.y) / (p1.x - p2.x)
33 1 5.0 5.0 1.1 n = p1.y - m * p1.x
34
35 1 19.0 19.0 4.4 max_x = max(p1.x, p2.x)
36 1 5.0 5.0 1.1 min_x = min(p1.x, p2.x)
37
38 6 30.0 5.0 6.9 for circle in circles:
39 6 73.0 12.2 16.7 if sqrt((circle.ctr.x - p1.x) ** 2 + (circle.ctr.y - p1.y) ** 2) < circle.r \
40 6 62.0 10.3 14.2 or sqrt((circle.ctr.x - p2.x) ** 2 + (circle.ctr.y - p2.y) ** 2) < circle.r:
41 return False
42
43 6 29.0 4.8 6.7 a = m ** 2 + 1
44 6 32.0 5.3 7.3 b = 2 * (m * n - m * circle.ctr.y - circle.ctr.x)
45 6 82.0 13.7 18.8 c = circle.ctr.x ** 2 + circle.ctr.y ** 2 + n ** 2 - circle.r ** 2 - 2 * n * circle.ctr.y
46
47 # compute the intersection points
48 6 33.0 5.5 7.6 discriminant = b ** 2 - 4 * a * c
49 5 11.0 2.2 2.5 if discriminant <= 0:
50 # no real roots, the line does not intersect the circle
51 5 22.0 4.4 5.0 continue
52
53 # two real roots, the line intersects the circle at two points
54 1 7.0 7.0 1.6 x1 = (-b + sqrt(discriminant)) / (2 * a)
55 1 4.0 4.0 0.9 x2 = (-b - sqrt(discriminant)) / (2 * a)
56
57 # check if one point in range
58 1 5.0 5.0 1.1 first = min_x < x1 < max_x
59 1 3.0 3.0 0.7 second = min_x < x2 < max_x
60 1 2.0 2.0 0.5 if first and second:
61 1 3.0 3.0 0.7 return False
62
63 return True
Total time: 0.0001534 s
Function: vectorized at line 66
Line # Hits Time Per Hit % Time Line Contents
==============================================================
66 def vectorized(p1: Point, p2: Point, circles):
67 1 10.0 10.0 0.7 m = (p1.y - p2.y) / (p1.x - p2.x)
68 1 5.0 5.0 0.3 n = p1.y - m * p1.x
69
70 1 7.0 7.0 0.5 max_x = max(p1.x, p2.x)
71 1 4.0 4.0 0.3 min_x = min(p1.x, p2.x)
72
73 1 10.0 10.0 0.7 circle_ctr_x = circles['x']
74 1 3.0 3.0 0.2 circle_ctr_y = circles['y']
75 1 3.0 3.0 0.2 circle_radius = circles['r']
76
77 # Pt 1 inside circle
78 1 652.0 652.0 42.5 if np.any(np.sqrt((circle_ctr_x - p1.x) ** 2 + (circle_ctr_y - p1.y) ** 2) < circle_radius):
79 return False
80 # Pt 2 inside circle
81 1 161.0 161.0 10.5 if np.any(np.sqrt((circle_ctr_x - p2.x) ** 2 + (circle_ctr_y - p2.y) ** 2) < circle_radius):
82 return False
83 # Line intersects with circle in range
84 1 13.0 13.0 0.8 a = m ** 2 + 1
85 1 120.0 120.0 7.8 b = 2 * (m * n - m * circle_ctr_y - circle_ctr_x)
86 1 77.0 77.0 5.0 c = circle_ctr_x ** 2 + circle_ctr_y ** 2 + n ** 2 - circle_radius ** 2 - 2 * n * circle_ctr_y
87
88 # compute the intersection points
89 1 25.0 25.0 1.6 discriminant = b**2 - 4*a*c
90 1 46.0 46.0 3.0 discriminant_bigger_than_zero = discriminant > 0
91 1 56.0 56.0 3.7 discriminant = discriminant[discriminant_bigger_than_zero]
92
93 1 6.0 6.0 0.4 if discriminant.size == 0:
94 return True
95
96 1 12.0 12.0 0.8 b = b[discriminant_bigger_than_zero]
97
98 # two real roots, the line intersects the circle at two points
99 1 77.0 77.0 5.0 x1 = (-b + np.sqrt(discriminant)) / (2 * a)
100 1 28.0 28.0 1.8 x2 = (-b - np.sqrt(discriminant)) / (2 * a)
101
102 # check if both points in range
103 1 96.0 96.0 6.3 in_range = (min_x <= x1) & (x1 <= max_x) & (min_x <= x2) & (x2 <= max_x)
104 1 123.0 123.0 8.0 return not np.any(in_range)
For some reason the non-vectorized function runs faster.
My guess is that the vectorized function processes the whole array every time, while the non-vectorized one stops partway through as soon as it finds a circle intersection.
So my questions are:
Is there a numpy function which doesn't iterate over the whole array but stops when the results are false?
What is the reason the vectorized function takes longer to run?
Any general optimization suggestions would be appreciated.
Is there a numpy function which doesn't iterate over the whole array but stops when the results are false?
No. This is a long-standing feature requested by Numpy users, but it will almost certainly never be added to Numpy. For simple cases, like returning the first index of a boolean array, Numpy could implement it, but the boolean array would still need to be fully created first. To support the general case, Numpy would have to merge multiple operations and perform some kind of lazy computation, which basically means rewriting Numpy from scratch (a huge amount of work).
If you need this behaviour, there are two main solutions:
operating on chunks, so the computation can stop early (at the cost of computing up to len(chunk) extra items), as sketched below;
writing your own fast compiled implementation using Numba or Cython (with views).
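A minimal sketch of the chunked approach, assuming the circle centres and radii are plain float arrays (comparing squared distances also avoids the sqrt):
import numpy as np

def any_point_inside_chunked(cx, cy, r, px, py, chunk=4096):
    # Scan the arrays chunk by chunk so the scan can stop early
    # instead of always evaluating the predicate over the whole array.
    for s in range(0, cx.size, chunk):
        e = s + chunk
        if np.any((cx[s:e] - px) ** 2 + (cy[s:e] - py) ** 2 < r[s:e] ** 2):
            return True
    return False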
What is the reason the vectorized function takes longer to run?
The input is pretty small, and Numpy is not optimized for small arrays. Each call to a Numpy function typically takes 0.4-4 µs on a mainstream processor (like my i5-9600KF), because Numpy has many checks to do, new arrays to allocate, generic internal iterators to build, etc. As a result, a line like np.any(np.sqrt((circle_ctr_x - p1.x) ** 2 + (circle_ctr_y - p1.y) ** 2) < circle_radius), which performs 8 Numpy calls and creates 7 temporary arrays, takes about 8 µs on my machine. The second, similar line takes about as long. Together, these two lines alone are already slower than the whole non-vectorized version.
As pointed out in the question and the comments, the non-vectorized function can also stop early, which helps it beat the vectorized version by an even wider margin.
Any general optimization suggestions would be appreciated
Regarding your code, using Numba (with plain loops and Numpy arrays) is certainly a good idea for performance. Note that the first call can be slower due to compilation time (you can provide the signature so the function is compiled at load time, or use an AOT compiler; Cython also works). A sketch follows.
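A minimal Numba sketch of the loop version (a hypothetical helper, assuming the circle data is unpacked into three flat float arrays, e.g. np.ascontiguousarray(np_c['x']) and so on):
import numba as nb

@nb.njit
def loop_numba(p1x, p1y, p2x, p2y, cx, cy, cr):
    # Same logic as `loop`, but on plain scalars and arrays so Numba
    # can type and compile everything, early exit included.
    m = (p1y - p2y) / (p1x - p2x)
    n = p1y - m * p1x
    min_x, max_x = min(p1x, p2x), max(p1x, p2x)
    for i in range(cx.size):
        if ((cx[i] - p1x) ** 2 + (cy[i] - p1y) ** 2 < cr[i] ** 2
                or (cx[i] - p2x) ** 2 + (cy[i] - p2y) ** 2 < cr[i] ** 2):
            return False
        a = m ** 2 + 1
        b = 2 * (m * n - m * cy[i] - cx[i])
        c = cx[i] ** 2 + cy[i] ** 2 + n ** 2 - cr[i] ** 2 - 2 * n * cy[i]
        disc = b ** 2 - 4 * a * c
        if disc <= 0:
            continue
        x1 = (-b + disc ** 0.5) / (2 * a)
        x2 = (-b - disc ** 0.5) / (2 * a)
        if min_x <= x1 <= max_x and min_x <= x2 <= max_x:
            return False
    return True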
Note that arrays of structures are generally not efficient, since they prevent the effective use of SIMD instructions. They are also not computed efficiently by Numpy: the record datatype is created dynamically, while the Numpy code is compiled ahead of time, so Numpy cannot provide specialized functions for that datatype and has to fall back to generic dynamic operations on each item, which is significantly slower than with basic datatypes. Please consider using structures of arrays. For more information, please read this post and, more generally, this post.
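For example, converting the question's np_c records array into a structure of arrays is one line per field (a sketch based on the variables above):
xs = np.ascontiguousarray(np_c['x'])  # one contiguous, SIMD-friendly array per field
ys = np.ascontiguousarray(np_c['y'])
rs = np.ascontiguousarray(np_c['r'])
These are also the arrays the Numba sketch above expects.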

Error calculating entropy over pandas series

I'm trying to calculate the entropy over a pandas series. Specifically, I group the strings in Direction into sequences using this function:
diff_dir = df.iloc[0:,1].ne(df.iloc[0:,1].shift()).cumsum()
This returns a label that stays the same for consecutive identical strings in Direction and increments at each change. For each sequence of the same Direction string, I want to calculate the entropy of X and Y.
With this code, the sequence labels are:
0 1
1 1
2 1
3 1
4 1
5 2
6 2
7 2
8 3
9 3
This code used to work, but it now returns an error; I'm not sure whether this started after an upgrade.
import pandas as pd
import numpy as np


def ApEn(U, m=2, r=0.2):
    '''
    Approximate Entropy
    Quantify the amount of regularity over time-series data.
    Input parameters:
    U = Time series
    m = Length of compared run of data (subseries length)
    r = Filtering level (tolerance). A positive number
    '''
    def _maxdist(x_i, x_j):
        return max([abs(ua - va) for ua, va in zip(x_i, x_j)])

    def _phi(m):
        x = [U.tolist()[i:i + m] for i in range(N - m + 1)]
        C = [len([1 for x_j in x if _maxdist(x_i, x_j) <= r]) / (N - m + 1.0) for x_i in x]
        return (N - m + 1.0)**(-1) * sum(np.log(C))

    N = len(U)
    return abs(_phi(m + 1) - _phi(m))


def Entropy(df):
    '''
    Calculate entropy for individual direction
    '''
    df = df[['Time', 'Direction', 'X', 'Y']]
    diff_dir = df.iloc[0:, 1].ne(df.iloc[0:, 1].shift()).cumsum()
    # Calculate ApEn grouped by direction.
    df['ApEn_X'] = df.groupby(diff_dir)['X'].transform(ApEn)
    df['ApEn_Y'] = df.groupby(diff_dir)['Y'].transform(ApEn)
    return df


df = pd.DataFrame(np.random.randint(0, 50, size=(10, 2)), columns=list('XY'))
df['Time'] = range(1, len(df) + 1)
direction = ['Left', 'Left', 'Left', 'Left', 'Left', 'Right', 'Right', 'Right', 'Left', 'Left']
df['Direction'] = direction
# Calculate defensive regularity
entropy = Entropy(df)
df = pd.DataFrame(np.random.randint(0,50, size = (10, 2)), columns=list('XY'))
df['Time'] = range(1, len(df) + 1)
direction = ['Left','Left','Left','Left','Left','Right','Right','Right','Left','Left']
df['Direction'] = direction
# Calculate defensive regularity
entropy = Entropy(df)
Error:
return (N - m + 1.0)**(-1) * sum(np.log(C))
ZeroDivisionError: 0.0 cannot be raised to a negative power
The issue is caused by this expression:
(N - m + 1.0)**(-1)
Consider the situation when N == 1. Since N = len(U), this happens whenever a group produced by the groupby has size 1. With m == 2, the expression becomes
(1 - 2 + 1.0)**(-1) == 0.0**(-1)
and 0.0 cannot be raised to a negative power, hence the error.
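You can reproduce it in isolation:
>>> (1 - 2 + 1.0) ** (-1)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
ZeroDivisionError: 0.0 cannot be raised to a negative power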
Now, thinking about it theoretically: how would you define the approximate entropy of a time series with just one value? It is highly unpredictable, so arguably it should be as high as possible. For this case, let us set it to np.nan to denote that it is not defined (entropy is always greater than or equal to 0).
Code:
import pandas as pd
import numpy as np


def ApEn(U, m=2, r=0.2):
    '''
    Approximate Entropy
    Quantify the amount of regularity over time-series data.
    Input parameters:
    U = Time series
    m = Length of compared run of data (subseries length)
    r = Filtering level (tolerance). A positive number
    '''
    def _maxdist(x_i, x_j):
        return max([abs(ua - va) for ua, va in zip(x_i, x_j)])

    def _phi(m):
        # Guard against groups that are too short: with N - m + 1 == 0
        # the expression below would raise ZeroDivisionError.
        if (N - m + 1) == 0:
            return np.nan
        x = [U.tolist()[i:i + m] for i in range(N - m + 1)]
        C = [len([1 for x_j in x if _maxdist(x_i, x_j) <= r]) / (N - m + 1.0) for x_i in x]
        return (N - m + 1)**(-1) * sum(np.log(C))

    N = len(U)
    return abs(_phi(m + 1) - _phi(m))


def Entropy(df):
    '''
    Calculate entropy for individual direction
    '''
    df = df[['Time', 'Direction', 'X', 'Y']]
    diff_dir = df.iloc[0:, 1].ne(df.iloc[0:, 1].shift()).cumsum()
    # Calculate ApEn grouped by direction.
    df['ApEn_X'] = df.groupby(diff_dir)['X'].transform(ApEn)
    df['ApEn_Y'] = df.groupby(diff_dir)['Y'].transform(ApEn)
    return df


np.random.seed(0)
df = pd.DataFrame(np.random.randint(0, 50, size=(10, 2)), columns=list('XY'))
df['Time'] = range(1, len(df) + 1)
direction = ['Left', 'Left', 'Left', 'Left', 'Left', 'Right', 'Right', 'Right', 'Left', 'Left']
df['Direction'] = direction
# Calculate defensive regularity
print(Entropy(df))
Output:
Time Direction X Y ApEn_X ApEn_Y
0 1 Left 6 16 0.287682 0.287682
1 2 Left 22 6 0.287682 0.287682
2 3 Left 16 5 0.287682 0.287682
3 4 Left 5 48 0.287682 0.287682
4 5 Left 11 21 0.287682 0.287682
5 6 Right 44 25 0.693147 0.693147
6 7 Right 14 12 0.693147 0.693147
7 8 Right 43 40 0.693147 0.693147
8 9 Left 46 44 NaN NaN
9 10 Left 49 2 NaN NaN
Larger sample (which triggers the 0**-1 issue):
np.random.seed(0)
df = pd.DataFrame(np.random.randint(0,50, size = (100, 2)), columns=list('XY'))
df['Time'] = range(1, len(df) + 1)
direction = ['Left','Right','Up','Down']
df['Direction'] = np.random.choice((direction), len(df))
print (Entropy(df))
Output:
Time Direction X Y ApEn_X ApEn_Y
0 1 Left 44 47 NaN NaN
1 2 Left 0 3 NaN NaN
2 3 Down 3 39 NaN NaN
3 4 Right 9 19 NaN NaN
4 5 Up 21 36 NaN NaN
.. ... ... .. .. ... ...
95 96 Up 19 33 NaN NaN
96 97 Left 40 32 NaN NaN
97 98 Up 36 6 NaN NaN
98 99 Left 21 31 NaN NaN
99 100 Right 13 7 NaN NaN
It appears that when ApEn._phi() is invoked, the specific values of N and m can make (N - m + 1.0) equal to 0, which then has to be raised to the power of -1; that is undefined (see also Why does zero raised to the power of negative one equal infinity?).
To illustrate, I replicated your scenario; in the first iteration of the transform operation, this is what happens:
U is:
1     0
2    48
(the first group has 2 elements)
N is: 2
m is: 3
So when you reach the return value of _phi(), you are computing (N - m + 1.0)**(-1) = (2 - 3 + 1.0)**(-1) = 0.0**(-1), which is undefined. Perhaps the key here is that you say you are grouping by individual direction and passing the U array into the approximate entropy function, yet you are actually grouping by diff_dir, which produces very small groups by construction. As far as I understand, if you want the approximate entropy per direction, you simply need to group by 'Direction':
def Entropy(df):
    '''
    Calculate entropy for individual direction
    '''
    # Calculate ApEn grouped by direction.
    df['ApEn_X'] = df.groupby('Direction')['X'].transform(ApEn)
    df['ApEn_Y'] = df.groupby('Direction')['Y'].transform(ApEn)
    return df
This results in a dataframe like this:
entropy.head()
Time Direction X Y ApEn_X ApEn_Y
0 1 Left 28 47 0.035091 0.035091
1 2 Up 8 47 0.013493 0.046520
2 3 Up 0 32 0.013493 0.046520
3 4 Right 34 8 0.044452 0.044452
4 5 Right 49 27 0.044452 0.044452
You have to handle the ZeroDivisionError. Maybe this way:
def _phi(m):
    if N == m - 1:
        return 0
    ...
You will then run into length mismatches on the groupby: df and diff_dir have to be the same length.

Finding the number of k-primes within a range

Write a function count_Kprimes with parameters k, start and nd that returns the list of k-primes (numbers with exactly k prime factors, counted with multiplicity) between start (inclusive) and nd (inclusive).
Here is my attempt:
def count_Kprimes(k, start, nd):
    ls = []
    for x in range(start, nd + 1):
        y = x
        l = []
        for i in range(2, x + 1):
            while y % i == 0:
                l.append(i)
                y //= i  # integer division keeps y exact in Python 3
        if len(l) == k:
            ls.append(x)
    return ls
However, my code takes too much time to run, and I want to simplify it. How can that be done? Thank you so much!
This task is taken from Codewars.
Well, I had fun solving this anyway. Here is a solution based on array logic:
import numpy as np


def count_Kprimes(k, start, nd):
    x = np.arange(start, nd + 1, dtype=float)
    # divs will contain all divisors (plus one extra column)
    divs = np.ones((x.size, k + 1))
    # we only have to loop up to nd / 2^(k-1) to find all divisors
    for i in range(2, int(nd / 2 ** (k - 1)) + 1):
        # but each possible divisor "i" may occur up to k times;
        # we loop until k+1 to also catch numbers that exceed our target,
        # so we can discard them later
        for j in range(1, k + 2):
            # check for each row (dimension 0) whether i divides the remainder,
            # then set the first one-valued entry in that row to this divisor
            d = np.prod(divs, axis=1)
            mask = np.rint(x / d / i) == x / d / i
            divs[mask, np.argmin(divs[mask], axis=1)] = i
    # The result we're looking for is each row that has exactly
    # k values != 1 (which equals exactly one "1" per row)
    indices = np.apply_along_axis(lambda row: row[row == 1].size == 1, 1, divs)
    for val, d in zip(x[indices], divs[indices]):
        print("{} = {}".format(int(val), " * ".join(str(int(_)) for _ in d[:-1])))


count_Kprimes(3, 1, 100)
returns
8 = 2 * 2 * 2
12 = 2 * 2 * 3
18 = 2 * 3 * 3
20 = 2 * 2 * 5
27 = 3 * 3 * 3
28 = 2 * 2 * 7
30 = 2 * 3 * 5
42 = 2 * 3 * 7
44 = 2 * 2 * 11
45 = 3 * 3 * 5
50 = 2 * 5 * 5
52 = 2 * 2 * 13
63 = 3 * 3 * 7
66 = 2 * 3 * 11
68 = 2 * 2 * 17
70 = 2 * 5 * 7
75 = 3 * 5 * 5
76 = 2 * 2 * 19
78 = 2 * 3 * 13
92 = 2 * 2 * 23
98 = 2 * 7 * 7
99 = 3 * 3 * 11
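For comparison, here is a plain-Python sketch (hypothetical helper name) that avoids the full range(2, x + 1) trial loop by dividing factors out and stopping at sqrt(y); it is usually fast enough for the Codewars ranges:
def count_Kprimes_simple(k, start, nd):
    out = []
    for n in range(start, nd + 1):
        y, cnt, i = n, 0, 2
        while i * i <= y:
            while y % i == 0:  # divide out each prime factor completely
                cnt += 1
                y //= i
            i += 1
        if y > 1:              # whatever remains is one last prime factor
            cnt += 1
        if cnt == k:
            out.append(n)
    return out
count_Kprimes_simple(3, 1, 100) should return the same 22 numbers listed above.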

Running function until pearson correlation function of >0.99 is achieved

I have the following code:
import math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress

c1_high = 98
c1_low = 75
c2_high = 15
c2_low = 6
c3_high = 8
c3_low = 2


def mix_gen(number):
    flag = 0
    container = []
    y_array = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    while flag < number:
        c1 = np.random.uniform(c1_low, c1_high)
        c2 = np.random.uniform(c2_low, c2_high)
        c3 = np.random.uniform(c3_low, c3_high)
        tot = c1 + c2 + c3
        if 99.99 <= tot <= 100.01:
            flag += 1
            container.append([c1, c2, c3])
    return container


def average(x):
    assert len(x) > 0
    return float(sum(x)) / len(x)


def pearson_def(x, y):
    assert len(x) == len(y)
    n = len(x)
    assert n > 0
    avg_x = average(x)
    avg_y = average(y)
    diffprod = 0
    xdiff2 = 0
    ydiff2 = 0
    for idx in range(n):
        xdiff = x[idx] - avg_x
        ydiff = y[idx] - avg_y
        diffprod += xdiff * ydiff
        xdiff2 += xdiff * xdiff
        ydiff2 += ydiff * ydiff
    return diffprod / math.sqrt(xdiff2 * ydiff2)


def corr_check():
    while True:
        mixes = mix_gen(5)
        mixes_C1 = [item[0] for item in mixes]
        mixes_C2 = [item[1] for item in mixes]
        mixes_C3 = [item[2] for item in mixes]
        mylen = [1, 2, 3, 4, 5]
        c1_r = pearson_def(mixes_C1, mylen)
        c2_r = pearson_def(mixes_C2, mylen)
        c3_r = pearson_def(mixes_C3, mylen)
        if c1_r > 0.99 and c2_r > 0.99 and c3_r > 0.99:
            print(mixes)
            print(c1_r)
            return mixes  # stop once an acceptable batch has been found


corr = corr_check()
print(corr)
When converted to a dataframe, this code effectively gives me the following output:
C1 C2 C3 sum range
1 70 20 10 100 ^
2 .. |
3 .. |
4 .. |
5 .. |
6 .. |
7 .. |
8 .. |
9 .. |
10 .. |
11 90 _
I require the sum of each row to equal 100, and each column to have a Pearson correlation (against the sequence 1..5) greater than 0.99.
However, the number of iterations this rejection approach requires makes the problem almost impossible to solve. Is there a better way of achieving this goal instead of relying on the initial random number generation for all three components C1, C2 and C3?
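A hedged sketch of one incremental speed-up, assuming the acceptance criterion stays as-is: replace the hand-rolled pearson_def with np.corrcoef and test each candidate batch in a few vectorized calls. Note also that the two constraints pull against each other: if every column rises almost linearly with the row index (r > 0.99), the row sums cannot stay pinned at 100, which is why acceptable batches are so rare.
def corr_check_fast(threshold=0.99):
    # Reuses mix_gen from above; target is the sequence 1..5.
    target = np.arange(1, 6)
    while True:
        batch = np.array(mix_gen(5))  # shape (5, 3): five rows, three components
        rs = [np.corrcoef(batch[:, j], target)[0, 1] for j in range(3)]
        if all(r > threshold for r in rs):
            return batch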

Numpy array index out of range with Genetic Algorithm

I wrote a script to generate an image from a source image, approximating it with randomized ellipses using a genetic algorithm. I keep receiving this error (the length of seeds is different every time; this is just an example) after running it:
Output:
[[ 42 166 88 21]
[ 25 201 321 227]
[ 21 78 153 53]
[ 5 74 231 20]
[ 3 96 394 15]
[ 20 239 28 244]
[ 33 6 94 27]
[ 4 253 193 113]
[ 10 139 323 16]
[ 31 9 97 117]
[ 23 273 181 214]
[ 24 286 361 231]
[ 33 2 187 47]
[ 35 98 133 177]
[ 10 307 136 76]
[ 35 132 269 161]
[ 25 147 11 2]
[ 36 141 338 100]
[ 23 163 430 37]
[ 17 285 216 53]
[ 18 2 181 119]
[ 43 199 117 253]] 22
Traceback (most recent call last):
File "E:/genetic image/genetic_image.py", line 106, in <module>
generate()
File "E:/genetic image/genetic_image.py", line 93, in generate
params, test_image = seed_test(seeds[:random.randint(0, reproduce)])
File "E:/genetic image/genetic_image.py", line 41, in seed_test
r = int(seeds[i, 0] + random.random() - 0.5)
IndexError: index (22) out of range (0<=index<22) in dimension 0
Here is the script:
import random
import copy
import numpy
from PIL import Image, ImageDraw

optimal = Image.open("charles-darwin_large.jpg")
optimal = optimal.convert("RGB")
size = width, height = optimal.size

population = 2
generations = 5000
elements = int(1e3)
reproduce = height / 10
max_radius = height / 10
diff_max = height / 10


def random_test():
    test_elements = []
    test_image = Image.new("RGB", (width, height), "white")
    draw = ImageDraw.Draw(test_image)
    for i in range(elements):
        r = int(max_radius * random.random())
        x, y = random.randint(0, width), random.randint(0, height)
        color_value = random.randint(0, 255)
        color = (color_value, color_value, color_value)
        test_elements.append([r, x, y, color_value])
        draw.ellipse((x - r, y - r, x + r, y + r), fill=color)
    return test_elements, test_image


def seed_test(seeds):
    test_elements = []
    test_image = Image.new("RGB", (width, height), "white")
    draw = ImageDraw.Draw(test_image)
    print seeds, len(seeds)
    for i in range(elements):
        r = int(seeds[i, 0] + random.random() - 0.5)
        x, y = seeds[i, 1] + random.randint(-5, 5), seeds[i, 2] + random.randint(-5, 5)
        color_value = seeds[i, 3] + random.randint(-5, 5)
        color = (color_value, color_value, color_value)
        test_elements.append([r, x, y, color_value])
        draw.ellipse((x - r, y - r, x + r, y + r), fill=color)
    return test_elements, test_image


def grayscale(image):
    return image.convert("LA")


def fitness(source, generated):
    fitness = 0
    for i in range(height - 1):
        for j in range(width - 1):
            r1, g1, b1 = source.getpixel((j, i))
            r2, g2, b2 = generated.getpixel((j, i))
            deltaRed = r1 - r2
            deltaGreen = g1 - g2
            deltaBlue = b1 - b2
            pixelFitness = deltaRed ** 2 + deltaGreen ** 2 + deltaBlue ** 2
            fitness += pixelFitness
    return fitness


def generate():
    samples = []
    scores = [0] * reproduce
    for i in range(population):
        params, test_image = random_test()
        fitness_score = fitness(optimal, test_image)
        if fitness_score > scores[-1]:
            scores[-1] = fitness_score
            scores = sorted(scores)
            samples.append(params)
    for generation in range(generations):
        seeds = numpy.array(copy.deepcopy(samples))[0]
        samples = []
        scores = [0] * reproduce
        for i in range(population):
            params, test_image = seed_test(seeds[:random.randint(0, reproduce)])
            fitness_score = fitness(optimal, test_image)
            if fitness_score > scores[-1]:
                scores[-1] = fitness_score
                scores = sorted(scores)
                samples.append(params)
    for each in samples:
        print each


if __name__ == "__main__":
    generate()
The source image can be found here.
What does the error mean?
You have 1000 elements (1e3) but only 22 seeds (indexes 0-21), so when you try to get the item seeds[22, 0] in the following loop, the index is out of range:
for i in range(elements):
    r = int(seeds[i, 0] ...
I suspect that what you need is:
for i in range(len(seeds)):
    ...
In your code you set the global elements to 1000; why not bound the loop by len(seeds) instead? At present, whenever there are fewer than 1000 seed values, the algorithm is guaranteed to fail in the way you describe.
A problem with your current solution attempt is that much of the "coupling" between the various functions takes place through global variables. In Python we like to say "explicit is better than implicit", and best software engineering practice would be to pass the data explicitly to the functions that use it. A sketch follows.
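A hedged sketch of what that could look like for seed_test (hypothetical signature, folding in both suggestions above):
def seed_test(seeds, size, max_elements):
    # Everything the function needs arrives as an argument, and the
    # loop is bounded by the number of seeds that actually exist.
    width, height = size
    test_elements = []
    test_image = Image.new("RGB", (width, height), "white")
    draw = ImageDraw.Draw(test_image)
    for i in range(min(max_elements, len(seeds))):
        r = int(seeds[i, 0] + random.random() - 0.5)
        x = seeds[i, 1] + random.randint(-5, 5)
        y = seeds[i, 2] + random.randint(-5, 5)
        color_value = seeds[i, 3] + random.randint(-5, 5)
        test_elements.append([r, x, y, color_value])
        draw.ellipse((x - r, y - r, x + r, y + r),
                     fill=(color_value, color_value, color_value))
    return test_elements, test_image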
