I'm trying to learn more about the Laplace transform, so I've tried to implement the forward and inverse (Mellin's inverse formula) transforms in code (approximated using the trapezium rule). I would expect to get roughly the same information back out when doing the forward and inverse one after the other. However, the output values appear to have nothing to do with the input data.
CODE:
# Dependencies:
from math import ceil
from cmath import *
import numpy as np
# Constants
j = complex(0, 1)
e = exp(1).real
# Default Values
sigma_default = 0 # Real component. When 0, the result is the Fourier transform
# Forward Transform - Time Domain to Laplace Domain
def Laplace(data, is_inverse, sigma=sigma_default, frequency_stamps=None, time_stamps=None):
# Resolve empty data scenario
data = np.asarray(data)
if data.size <= 1:
return data
# Add time data if missing
if time_stamps is None:
if is_inverse is False:
time_stamps = np.arange(0, data.size)
else:
time_stamps = np.arange(0, data.size * 2)
else:
time_stamps = np.asarray(time_stamps).real
if time_stamps.size is not data.size:
time_stamps = np.arange(0, data.size)
# Add frequency stamps if missing
if frequency_stamps is None:
if is_inverse is False:
frequency_stamps = np.asarray(np.arange(0, ceil(data.size / 2))).real * 2 * pi # Added forgotten constant
else:
frequency_stamps = np.asarray(np.arange(0, ceil(data.size))).real * 2 * pi # Added forgotten constant
else:
frequency_stamps = np.asarray(frequency_stamps).real
frequency_stamps = sigma + frequency_stamps * j
# Create the vector of powers exp(1) is raised to. Also create the delta times / frequencies
if is_inverse is False:
power = -Get_Powers(time_stamps, frequency_stamps)
delta = np.diff(time_stamps)
else:
power = Get_Powers(frequency_stamps, time_stamps)
delta = np.diff(frequency_stamps)
delta = np.concatenate([[np.average(delta)], delta]) # Ensure a start value is present
# Perform a numerical approximation of the Laplace transform
laplace = data * np.power(e, power) * delta
# Trapezium rule => average 1st and last wrt zero
laplace = laplace.transpose() # Fixed bug in trapezium rule implementation
laplace[[0, -1]] *= 0.5
laplace = laplace.transpose()
laplace = np.sum(laplace, 1) # Integrate
# If inverse function, then normalise and ensure the result is real
if is_inverse is True:
laplace *= 1 / (2 * pi * j) # Scale
laplace = laplace.real # Ensure time series is real only
# Return the result
return laplace
# Used to derive the vector of powers exp(1) is to be raised to
def Get_Powers(values1, values2):
# For forward Laplace, 1 = time, 2 = frequency
# For inverse Laplace, 1 = frequency, 2 = time
power = np.ones([values1.size, values2.size])
power = (power * values2).transpose() * values1
return power
if __name__ == "__main__":
# a = [0, 1, 2, 3, 4, 5]
a = np.arange(0, 10)
b = Laplace(a, False)
c = Laplace(b, True)
print(np.asarray(a))
print(c)
EXPECTED RESULT:
[0 1 2 3 4 5 6 7 8 9]
[0 1 2 3 4 5 6 7 8 9]
ACTUAL RESULT:
[0 1 2 3 4 5 6 7 8 9]
[162. 162. 162. 162. 162. 162. 162. 162. 162. 162.]
Any ideas where I've gone awry?
EDIT 1: Added Laplace functions:
Forwards transform:
Inverse transform:
Definition of s:
Where omega is represented as frequency_stamps in my code. When sigma = 0 the system becomes the Fourier transform.
EDIT 2: Fixed two bugs. Problem still persists
Besides the two bug fixes made in the original question, there were a further 3 bugs left that I identified via Cris Luengo's suggestion to look into the conversion from the Fourier Transform into the Discrete Fourier Transform. A summary of all bug fixes is below:
Fixed a bug in how I implemented the trapezium rule.
Scaled the frequency_stamps by 2*pi to reflect the underlying circular nature of the Laplace data.
Rescaled the frequency_stamps again such that they only travel around a circle once (aka. the data is in the range 0 -> 2*pi).
Fixed a mistake where I'd assumed that there only needed to be half as many frequency points than time points. That's wrong. There should be an equal amount of both.
Allowed the passing of initial and final time series points for the inverse transform as the data otherwise gets corrupted.
Updated Code:
# Dependencies:
from cmath import *
import numpy as np
# Constants
j = complex(0, 1)
e = exp(1).real
# Default Values
sigma_default = 0.0 # Real component. When 0, the result is the Fourier transform
ends_default = np.asarray([0, 0])
# Forward Transform - Time Domain to Laplace Domain
def Laplace(data, is_inverse, sigma=sigma_default, frequency_stamps=None, time_stamps=None, ends=ends_default):
# Resolve empty data scenario
data = np.asarray(data)
if data.size <= 1:
return data
# Add time data if missing
if time_stamps is None:
time_stamps = np.arange(0, data.size) # Size doesn't change between forward and inverse
else:
time_stamps = np.asarray(time_stamps).real
if time_stamps.size is not data.size:
time_stamps = np.arange(0, data.size)
# Add frequency stamps if missing
if frequency_stamps is None:
frequency_stamps = np.asarray(np.arange(0.0, data.size)).real # Size doesn't change between forward and inverse
frequency_stamps *= 2 * pi / np.max(frequency_stamps) # Restrict the integral range to 0 -> 2pi
else:
frequency_stamps = np.asarray(frequency_stamps).real
frequency_stamps = sigma + frequency_stamps * j
# Create the vector of powers exp(1) is raised to. Also create the delta times / frequencies
if is_inverse is False:
power = -Get_Powers(time_stamps, frequency_stamps)
delta = np.diff(time_stamps)
else:
power = Get_Powers(frequency_stamps, time_stamps)
delta = np.diff(frequency_stamps)
delta = np.concatenate([[np.average(delta)], delta]) # Ensure a start value is present
# Perform a numerical approximation of the Laplace transform
laplace = data * np.power(e, power) * delta
laplace = laplace.transpose()
laplace[[0, -1]] *= 0.5 # Trapezium rule => average 1st and last wrt zero
laplace = laplace.transpose()
laplace = np.sum(laplace, 1) # Integrate
# If inverse function, then normalise and ensure the result is real
if is_inverse is True:
laplace *= 1 / (2 * pi * j) # Scale
laplace = laplace.real # Ensure time series is real only
# Correct for edge cases
laplace[0] = ends[0]
laplace[-1] = ends[-1]
# Return the result
return laplace
# Used to derive the vector of powers exp(1) is to be raised to
def Get_Powers(values1, values2):
# For forward Laplace, 1 = time, 2 = frequency
# For inverse Laplace, 1 = frequency, 2 = time
power = np.ones([values1.size, values2.size])
power = (power * values2).transpose() * values1
return power
if __name__ == "__main__":
a = np.arange(3, 13)
b = Laplace(a, False, sigma=0.5)
c = Laplace(b, True, sigma=0.5, ends=np.asarray([3, 12]))
print(np.asarray(a))
print(c)
Output
[ 3 4 5 6 7 8 9 10 11 12]
[ 3. 4. 5. 6. 7. 8. 9. 10. 11. 12.]
Thanks for the assist!
I want to compute the log-likelihood of a logistic regression model.
def sigma(x):
return 1 / (1 + np.exp(-x))
def logll(y, X, w):
""""
Parameters
y : ndarray of shape (N,)
Binary labels (either 0 or 1).
X : ndarray of shape (N,D)
Design matrix.
w : ndarray of shape (D,)
Weight vector.
"""
p = sigma(X # w)
y_1 = y # np.log(p)
y_0 = (1 - y) # (1 - np.log(1 - p))
return y_1 + y_0
logll(y, Xz, np.linspace(-5,5,D))
Applying this function results in
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:16:
RuntimeWarning: divide by zero encountered in log
app.launch_new_instance()
I would expect y_0 to be a negative float. How can I avoid this error and is there a bug somewhere in the code?
Edit 1
X # w statistics:
Max: 550.775133944
Min: -141.972597608
Sigma(max): 1.0 => Throws error in y_0 in np.log(1 - 1.0)
Sigma(min): 2.19828642169e-62
Edit 2
I also have access to this logsigma function that computes sigma in log space:
def logsigma (x):
return np.vectorize(np.log)(sigma(x))
Unfortunately, I don't find a way to rewrite y_0 then. The following is my approach but obviously not correct.
def l(y, X, w):
y_1 = np.dot(y, logsigma(X # w))
y_0 = (1 - y) # (1 - np.log(1 - logsigma(X # w)))
return y_1 + y_0
First of all, I think you've made a mistake in your log-likelihood formula: it should be a plain sum of y_0 and y_1, not sum of exponentials:
Division by zero can be caused by large negative values (I mean large by abs value) in X # w, e.g. sigma(-800) is exactly 0.0 on my machine, so the log of it results in "RuntimeWarning: divide by zero encountered in log".
Make sure you initialize your network with small values near zero and you don't have exploding gradients after several iterations of backprop.
By the way, here's the code I use for cross-entropy loss, which works also in multi-class problems:
def softmax_loss(x, y):
"""
- x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
for the ith input.
- y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
0 <= y[i] < C
"""
probs = np.exp(x - np.max(x, axis=1, keepdims=True))
probs /= np.sum(probs, axis=1, keepdims=True)
N = x.shape[0]
return -np.sum(np.log(probs[np.arange(N), y])) / N
UPD: When nothing else helps, there is one more numerical trick (discussed in the comments): compute log(p+epsilon) and log(1-p+epsilon) with a small positive epsilon value. This ensures that log(0.0) never happens.
I'm trying to implement a multiclass logistic regression classifier that distinguishes between k different classes.
This is my code.
import numpy as np
from scipy.special import expit
def cost(X,y,theta,regTerm):
(m,n) = X.shape
J = (np.dot(-(y.T),np.log(expit(np.dot(X,theta))))-np.dot((np.ones((m,1))-y).T,np.log(np.ones((m,1)) - (expit(np.dot(X,theta))).reshape((m,1))))) / m + (regTerm / (2 * m)) * np.linalg.norm(theta[1:])
return J
def gradient(X,y,theta,regTerm):
(m,n) = X.shape
grad = np.dot(((expit(np.dot(X,theta))).reshape(m,1) - y).T,X)/m + (np.concatenate(([0],theta[1:].T),axis=0)).reshape(1,n)
return np.asarray(grad)
def train(X,y,regTerm,learnRate,epsilon,k):
(m,n) = X.shape
theta = np.zeros((k,n))
for i in range(0,k):
previousCost = 0;
currentCost = cost(X,y,theta[i,:],regTerm)
while(np.abs(currentCost-previousCost) > epsilon):
print(theta[i,:])
theta[i,:] = theta[i,:] - learnRate*gradient(X,y,theta[i,:],regTerm)
print(theta[i,:])
previousCost = currentCost
currentCost = cost(X,y,theta[i,:],regTerm)
return theta
trX = np.load('trX.npy')
trY = np.load('trY.npy')
theta = train(trX,trY,2,0.1,0.1,4)
I can verify that cost and gradient are returning values that are in the right dimension (cost returns a scalar, and gradient returns a 1 by n row vector), but i get the error
RuntimeWarning: divide by zero encountered in log
J = (np.dot(-(y.T),np.log(expit(np.dot(X,theta))))-np.dot((np.ones((m,1))-y).T,np.log(np.ones((m,1)) - (expit(np.dot(X,theta))).reshape((m,1))))) / m + (regTerm / (2 * m)) * np.linalg.norm(theta[1:])
why is this happening and how can i avoid this?
The proper solution here is to add some small epsilon to the argument of log function. What worked for me was
epsilon = 1e-5
def cost(X, y, theta):
m = X.shape[0]
yp = expit(X # theta)
cost = - np.average(y * np.log(yp + epsilon) + (1 - y) * np.log(1 - yp + epsilon))
return cost
You can clean up the formula by appropriately using broadcasting, the operator * for dot products of vectors, and the operator # for matrix multiplication — and breaking it up as suggested in the comments.
Here is your cost function:
def cost(X, y, theta, regTerm):
m = X.shape[0] # or y.shape, or even p.shape after the next line, number of training set
p = expit(X # theta)
log_loss = -np.average(y*np.log(p) + (1-y)*np.log(1-p))
J = log_loss + regTerm * np.linalg.norm(theta[1:]) / (2*m)
return J
You can clean up your gradient function along the same lines.
By the way, are you sure you want np.linalg.norm(theta[1:]). If you're trying to do L2-regularization, the term should be np.linalg.norm(theta[1:]) ** 2.
Cause:
This is happening because in some cases, whenever y[i] is equal to 1, the value of the Sigmoid function (theta) also becomes equal to 1.
Cost function:
J = (np.dot(-(y.T),np.log(expit(np.dot(X,theta))))-np.dot((np.ones((m,1))-y).T,np.log(np.ones((m,1)) - (expit(np.dot(X,theta))).reshape((m,1))))) / m + (regTerm / (2 * m)) * np.linalg.norm(theta[1:])
Now, consider the following part in the above code snippet:
np.log(np.ones((m,1)) - (expit(np.dot(X,theta))).reshape((m,1)))
Here, you are performing (1 - theta) when the value of theta is 1. So, that will effectively become log (1 - 1) = log (0) which is undefined.
I'm guessing your data has negative values in it. You can't log a negative.
import numpy as np
np.log(2)
> 0.69314718055994529
np.log(-2)
> nan
There are a lot of different ways to transform your data that should help, if this is the case.
def cost(X, y, theta):
yp = expit(X # theta)
cost = - np.average(y * np.log(yp) + (1 - y) * np.log(1 - yp))
return cost
The warning originates from np.log(yp) when yp==0 and in np.log(1 - yp) when yp==1. One option is to filter out these values, and not to pass them into np.log. The other option is to add a small constant to prevent the value from being exactly 0 (as suggested in one of the comments above)
Add epsilon value[which is a miniature value] to the log value so that it won't be a problem at all.
But i am not sure if it will give accurate results or not .
I am looking for a function that takes as input two lists, and returns the Pearson correlation, and the significance of the correlation.
You can have a look at scipy.stats:
from pydoc import help
from scipy.stats.stats import pearsonr
help(pearsonr)
>>>
Help on function pearsonr in module scipy.stats.stats:
pearsonr(x, y)
Calculates a Pearson correlation coefficient and the p-value for testing
non-correlation.
The Pearson correlation coefficient measures the linear relationship
between two datasets. Strictly speaking, Pearson's correlation requires
that each dataset be normally distributed. Like other correlation
coefficients, this one varies between -1 and +1 with 0 implying no
correlation. Correlations of -1 or +1 imply an exact linear
relationship. Positive correlations imply that as x increases, so does
y. Negative correlations imply that as x increases, y decreases.
The p-value roughly indicates the probability of an uncorrelated system
producing datasets that have a Pearson correlation at least as extreme
as the one computed from these datasets. The p-values are not entirely
reliable but are probably reasonable for datasets larger than 500 or so.
Parameters
----------
x : 1D array
y : 1D array the same length as x
Returns
-------
(Pearson's correlation coefficient,
2-tailed p-value)
References
----------
http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
The Pearson correlation can be calculated with numpy's corrcoef.
import numpy
numpy.corrcoef(list1, list2)[0, 1]
An alternative can be a native scipy function from linregress which calculates:
slope : slope of the regression line
intercept : intercept of the regression line
r-value : correlation coefficient
p-value : two-sided p-value for a hypothesis test whose null hypothesis is that the slope is zero
stderr : Standard error of the estimate
And here is an example:
a = [15, 12, 8, 8, 7, 7, 7, 6, 5, 3]
b = [10, 25, 17, 11, 13, 17, 20, 13, 9, 15]
from scipy.stats import linregress
linregress(a, b)
will return you:
LinregressResult(slope=0.20833333333333337, intercept=13.375, rvalue=0.14499815458068521, pvalue=0.68940144811669501, stderr=0.50261704627083648)
If you don't feel like installing scipy, I've used this quick hack, slightly modified from Programming Collective Intelligence:
def pearsonr(x, y):
# Assume len(x) == len(y)
n = len(x)
sum_x = float(sum(x))
sum_y = float(sum(y))
sum_x_sq = sum(xi*xi for xi in x)
sum_y_sq = sum(yi*yi for yi in y)
psum = sum(xi*yi for xi, yi in zip(x, y))
num = psum - (sum_x * sum_y/n)
den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5)
if den == 0: return 0
return num / den
The following code is a straight-up interpretation of the definition:
import math
def average(x):
assert len(x) > 0
return float(sum(x)) / len(x)
def pearson_def(x, y):
assert len(x) == len(y)
n = len(x)
assert n > 0
avg_x = average(x)
avg_y = average(y)
diffprod = 0
xdiff2 = 0
ydiff2 = 0
for idx in range(n):
xdiff = x[idx] - avg_x
ydiff = y[idx] - avg_y
diffprod += xdiff * ydiff
xdiff2 += xdiff * xdiff
ydiff2 += ydiff * ydiff
return diffprod / math.sqrt(xdiff2 * ydiff2)
Test:
print pearson_def([1,2,3], [1,5,7])
returns
0.981980506062
This agrees with Excel, this calculator, SciPy (also NumPy), which return 0.981980506 and 0.9819805060619657, and 0.98198050606196574, respectively.
R:
> cor( c(1,2,3), c(1,5,7))
[1] 0.9819805
EDIT: Fixed a bug pointed out by a commenter.
You can do this with pandas.DataFrame.corr, too:
import pandas as pd
a = [[1, 2, 3],
[5, 6, 9],
[5, 6, 11],
[5, 6, 13],
[5, 3, 13]]
df = pd.DataFrame(data=a)
df.corr()
This gives
0 1 2
0 1.000000 0.745601 0.916579
1 0.745601 1.000000 0.544248
2 0.916579 0.544248 1.000000
Rather than rely on numpy/scipy, I think my answer should be the easiest to code and understand the steps in calculating the Pearson Correlation Coefficient (PCC) .
import math
# calculates the mean
def mean(x):
sum = 0.0
for i in x:
sum += i
return sum / len(x)
# calculates the sample standard deviation
def sampleStandardDeviation(x):
sumv = 0.0
for i in x:
sumv += (i - mean(x))**2
return math.sqrt(sumv/(len(x)-1))
# calculates the PCC using both the 2 functions above
def pearson(x,y):
scorex = []
scorey = []
for i in x:
scorex.append((i - mean(x))/sampleStandardDeviation(x))
for j in y:
scorey.append((j - mean(y))/sampleStandardDeviation(y))
# multiplies both lists together into 1 list (hence zip) and sums the whole list
return (sum([i*j for i,j in zip(scorex,scorey)]))/(len(x)-1)
The significance of PCC is basically to show you how strongly correlated the two variables/lists are.
It is important to note that the PCC value ranges from -1 to 1.
A value between 0 to 1 denotes a positive correlation.
Value of 0 = highest variation (no correlation whatsoever).
A value between -1 to 0 denotes a negative correlation.
Pearson coefficient calculation using pandas in python:
I would suggest trying this approach since your data contains lists. It will be easy to interact with your data and manipulate it from the console since you can visualise your data structure and update it as you wish. You can also export the data set and save it and add new data out of the python console for later analysis. This code is simpler and contains less lines of code. I am assuming you need a few quick lines of code to screen your data for further analysis
Example:
data = {'list 1':[2,4,6,8],'list 2':[4,16,36,64]}
import pandas as pd #To Convert your lists to pandas data frames convert your lists into pandas dataframes
df = pd.DataFrame(data, columns = ['list 1','list 2'])
from scipy import stats # For in-built method to get PCC
pearson_coef, p_value = stats.pearsonr(df["list 1"], df["list 2"]) #define the columns to perform calculations on
print("Pearson Correlation Coefficient: ", pearson_coef, "and a P-value of:", p_value) # Results
However, you did not post your data for me to see the size of the data set or the transformations that might be needed before the analysis.
Hmm, many of these responses have long and hard to read code...
I'd suggest using numpy with its nifty features when working with arrays:
import numpy as np
def pcc(X, Y):
''' Compute Pearson Correlation Coefficient. '''
# Normalise X and Y
X -= X.mean(0)
Y -= Y.mean(0)
# Standardise X and Y
X /= X.std(0)
Y /= Y.std(0)
# Compute mean product
return np.mean(X*Y)
# Using it on a random example
from random import random
X = np.array([random() for x in xrange(100)])
Y = np.array([random() for x in xrange(100)])
pcc(X, Y)
Here's a variant on mkh's answer that runs much faster than it, and scipy.stats.pearsonr, using numba.
import numba
#numba.jit
def corr(data1, data2):
M = data1.size
sum1 = 0.
sum2 = 0.
for i in range(M):
sum1 += data1[i]
sum2 += data2[i]
mean1 = sum1 / M
mean2 = sum2 / M
var_sum1 = 0.
var_sum2 = 0.
cross_sum = 0.
for i in range(M):
var_sum1 += (data1[i] - mean1) ** 2
var_sum2 += (data2[i] - mean2) ** 2
cross_sum += (data1[i] * data2[i])
std1 = (var_sum1 / M) ** .5
std2 = (var_sum2 / M) ** .5
cross_mean = cross_sum / M
return (cross_mean - mean1 * mean2) / (std1 * std2)
This is a implementation of Pearson Correlation function using numpy:
def corr(data1, data2):
"data1 & data2 should be numpy arrays."
mean1 = data1.mean()
mean2 = data2.mean()
std1 = data1.std()
std2 = data2.std()
# corr = ((data1-mean1)*(data2-mean2)).mean()/(std1*std2)
corr = ((data1*data2).mean()-mean1*mean2)/(std1*std2)
return corr
Here is an implementation for pearson correlation based on sparse vector. The vectors here are expressed as a list of tuples expressed as (index, value). The two sparse vectors can be of different length but over all vector size will have to be same. This is useful for text mining applications where the vector size is extremely large due to most features being bag of words and hence calculations are usually performed using sparse vectors.
def get_pearson_corelation(self, first_feature_vector=[], second_feature_vector=[], length_of_featureset=0):
indexed_feature_dict = {}
if first_feature_vector == [] or second_feature_vector == [] or length_of_featureset == 0:
raise ValueError("Empty feature vectors or zero length of featureset in get_pearson_corelation")
sum_a = sum(value for index, value in first_feature_vector)
sum_b = sum(value for index, value in second_feature_vector)
avg_a = float(sum_a) / length_of_featureset
avg_b = float(sum_b) / length_of_featureset
mean_sq_error_a = sqrt((sum((value - avg_a) ** 2 for index, value in first_feature_vector)) + ((
length_of_featureset - len(first_feature_vector)) * ((0 - avg_a) ** 2)))
mean_sq_error_b = sqrt((sum((value - avg_b) ** 2 for index, value in second_feature_vector)) + ((
length_of_featureset - len(second_feature_vector)) * ((0 - avg_b) ** 2)))
covariance_a_b = 0
#calculate covariance for the sparse vectors
for tuple in first_feature_vector:
if len(tuple) != 2:
raise ValueError("Invalid feature frequency tuple in featureVector: %s") % (tuple,)
indexed_feature_dict[tuple[0]] = tuple[1]
count_of_features = 0
for tuple in second_feature_vector:
count_of_features += 1
if len(tuple) != 2:
raise ValueError("Invalid feature frequency tuple in featureVector: %s") % (tuple,)
if tuple[0] in indexed_feature_dict:
covariance_a_b += ((indexed_feature_dict[tuple[0]] - avg_a) * (tuple[1] - avg_b))
del (indexed_feature_dict[tuple[0]])
else:
covariance_a_b += (0 - avg_a) * (tuple[1] - avg_b)
for index in indexed_feature_dict:
count_of_features += 1
covariance_a_b += (indexed_feature_dict[index] - avg_a) * (0 - avg_b)
#adjust covariance with rest of vector with 0 value
covariance_a_b += (length_of_featureset - count_of_features) * -avg_a * -avg_b
if mean_sq_error_a == 0 or mean_sq_error_b == 0:
return -1
else:
return float(covariance_a_b) / (mean_sq_error_a * mean_sq_error_b)
Unit tests:
def test_get_get_pearson_corelation(self):
vector_a = [(1, 1), (2, 2), (3, 3)]
vector_b = [(1, 1), (2, 5), (3, 7)]
self.assertAlmostEquals(self.sim_calculator.get_pearson_corelation(vector_a, vector_b, 3), 0.981980506062, 3, None, None)
vector_a = [(1, 1), (2, 2), (3, 3)]
vector_b = [(1, 1), (2, 5), (3, 7), (4, 14)]
self.assertAlmostEquals(self.sim_calculator.get_pearson_corelation(vector_a, vector_b, 5), -0.0137089240555, 3, None, None)
I have a very simple and easy to understand solution for this. For two arrays of equal length, Pearson coefficient can be easily computed as follows:
def manual_pearson(a,b):
"""
Accepts two arrays of equal length, and computes correlation coefficient.
Numerator is the sum of product of (a - a_avg) and (b - b_avg),
while denominator is the product of a_std and b_std multiplied by
length of array.
"""
a_avg, b_avg = np.average(a), np.average(b)
a_stdev, b_stdev = np.std(a), np.std(b)
n = len(a)
denominator = a_stdev * b_stdev * n
numerator = np.sum(np.multiply(a-a_avg, b-b_avg))
p_coef = numerator/denominator
return p_coef
Starting in Python 3.10, the Pearson’s correlation coefficient (statistics.correlation) is directly available in the standard library:
from statistics import correlation
# a = [15, 12, 8, 8, 7, 7, 7, 6, 5, 3]
# b = [10, 25, 17, 11, 13, 17, 20, 13, 9, 15]
correlation(a, b)
# 0.1449981545806852
You may wonder how to interpret your probability in the context of looking for a correlation in a particular direction (negative or positive correlation.) Here is a function I wrote to help with that. It might even be right!
It's based on info I gleaned from http://www.vassarstats.net/rsig.html and http://en.wikipedia.org/wiki/Student%27s_t_distribution, thanks to other answers posted here.
# Given (possibly random) variables, X and Y, and a correlation direction,
# returns:
# (r, p),
# where r is the Pearson correlation coefficient, and p is the probability
# that there is no correlation in the given direction.
#
# direction:
# if positive, p is the probability that there is no positive correlation in
# the population sampled by X and Y
# if negative, p is the probability that there is no negative correlation
# if 0, p is the probability that there is no correlation in either direction
def probabilityNotCorrelated(X, Y, direction=0):
x = len(X)
if x != len(Y):
raise ValueError("variables not same len: " + str(x) + ", and " + \
str(len(Y)))
if x < 6:
raise ValueError("must have at least 6 samples, but have " + str(x))
(corr, prb_2_tail) = stats.pearsonr(X, Y)
if not direction:
return (corr, prb_2_tail)
prb_1_tail = prb_2_tail / 2
if corr * direction > 0:
return (corr, prb_1_tail)
return (corr, 1 - prb_1_tail)
You can take a look at this article. This is a well-documented example for calculating correlation based on historical forex currency pairs data from multiple files using pandas library (for Python), and then generating a heatmap plot using seaborn library.
http://www.tradinggeeks.net/2015/08/calculating-correlation-in-python/
Calculating Correlation:
Correlation - measures similarity of two different variables
Using pearson correlation
from scipy.stats import pearsonr
# final_data is the dataframe with n set of columns
pearson_correlation = final_data.corr(method='pearson')
pearson_correlation
# print correlation of n*n column
Using Spearman correlation
from scipy.stats import spearmanr
# final_data is the dataframe with n set of columns
spearman_correlation = final_data.corr(method='spearman')
spearman_correlation
# print correlation of n*n column
Using Kendall correlation
kendall_correlation=final_data.corr(method='kendall')
kendall_correlation
def correlation_score(y_true, y_pred):
"""Scores the predictions according to the competition rules.
It is assumed that the predictions are not constant.
Returns the average of each sample's Pearson correlation coefficient"""
y2 = y_pred.copy()
y2 -= y2.mean(axis=0); y2 /= y2.std(axis=0)
y1 = y_true.copy();
y1 -= y1.mean(axis=0); y1 /= y1.std(axis=0)
c = (y1*y2).mean().mean()# Correlation for rescaled matrices is just matrix product and average
return c
def pearson(x,y):
n=len(x)
vals=range(n)
sumx=sum([float(x[i]) for i in vals])
sumy=sum([float(y[i]) for i in vals])
sumxSq=sum([x[i]**2.0 for i in vals])
sumySq=sum([y[i]**2.0 for i in vals])
pSum=sum([x[i]*y[i] for i in vals])
# Calculating Pearson correlation
num=pSum-(sumx*sumy/n)
den=((sumxSq-pow(sumx,2)/n)*(sumySq-pow(sumy,2)/n))**.5
if den==0: return 0
r=num/den
return r