I am trying to implement a non-parametric estimation of the KL divergence shown in this paper.
Here is my code:
import numpy as np
import math
import itertools
import random
from scipy.interpolate import interp1d
def log(x):
    if x > 0:
        return math.log(x)
    else:
        return 0

g = lambda x, inp, N: sum(0.5 + 0.5 * np.sign(x - inp)) / N

def ecdf(x, N):
    out = [g(i, x, N) for i in x]
    fun = interp1d(x, out, kind='linear', bounds_error=False, fill_value=(0, 1))
    return fun
def KL_est(x, y):
    ex = min(np.diff(sorted(np.unique(x))))
    ey = min(np.diff(sorted(np.unique(y))))
    e = min(ex, ey) * 0.9
    N = len(x)
    x.sort()
    y.sort()
    P = ecdf(x, N)
    Q = ecdf(y, N)
    KL = sum(log(v) for v in ((P(x) - P(x - e)) / (Q(x) - Q(x - e)))) / N
    return KL
My trouble is with scipy's interp1d. I am using the function returned by interp1d to evaluate new inputs. The problem is that some of the input values are very close together (about 1e-5 apart), and the interpolated function returns the same value for both, so Q(x) - Q(x-e) is zero and the ratio raises a divide-by-zero error.
Here is some test code that reproduces the problem:
x = np.random.normal(0, 1, 10)
y = np.random.normal(0, 1, 10)
ex = min(np.diff(sorted(np.unique(x))))
ey = min(np.diff(sorted(np.unique(y))))
e = min(ex,ey) * 0.9
N = len(x)
x.sort()
y.sort()
P = ecdf(x,N)
Q = ecdf(y,N)
KL = sum(log(v) for v in ((P(x)-P(x-e))/(Q(x)-Q(x-e))) ) / N
How would I go about getting a more accurate interpolation?
As e gets small, you are effectively trying to compute the ratio of the derivatives of P and Q numerically. As you are finding, you run out of floating-point precision very quickly doing it this way.
An alternative approach is to use an interpolator that can return derivatives directly, for example scipy.interpolate.InterpolatedUnivariateSpline. You were passing kind='linear' to interp1d, so the equivalent spline order is k=1. Once constructed, the spline has a derivatives() method that returns all of its derivatives at a given point (and derivative() returns a new spline representing the derivative). For small values of e you could switch to using the derivative instead of the finite difference.
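A minimal sketch of that idea (the sample data and the names P_spl/Q_spl below are just illustrative, not from your code; a cubic spline, k=3, gives a smoother derivative than k=1):
import numpy as np
from scipy.interpolate import InterpolatedUnivariateSpline
x = np.sort(np.random.normal(0, 1, 100))
y = np.sort(np.random.normal(0, 1, 100))
N = len(x)
# ECDF values at the sorted sample points
Px = np.arange(1, N + 1) / float(N)
Qy = np.arange(1, len(y) + 1) / float(len(y))
# Interpolating splines through the ECDFs (k=1 would mirror kind='linear')
P_spl = InterpolatedUnivariateSpline(x, Px, k=3)
Q_spl = InterpolatedUnivariateSpline(y, Qy, k=3)
# Densities as derivatives of the CDF splines, evaluated at the x samples
p_at_x = P_spl.derivative()(x)
q_at_x = Q_spl.derivative()(x)
ratio = p_at_x / q_at_x  # replaces (P(x)-P(x-e)) / (Q(x)-Q(x-e))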
Related
I am working on the following code, which solves a system of coupled differential equations. I have been able to solve them, and I have plotted one of the solutions. I am curious how to compute and plot the derivative of this curve numerically (I know the derivative is given by the first function, but suppose I didn't have that). I was thinking that I could use a for-loop, but is there a faster way?
import numpy as np
from scipy.integrate import odeint
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
import math
def hiv(x, t):
    kr1 = 1e5
    kr2 = 0.1
    kr3 = 2e-7
    kr4 = 0.5
    kr5 = 5
    kr6 = 100
    h = x[0]  # Healthy cells -- function of time
    i = x[1]  # Infected cells -- function of time
    v = x[2]  # Virus -- function of time
    p = kr3 * h * v
    dhdt = kr1 - kr2*h - p
    didt = p - kr4*i
    dvdt = -p - kr5*v + kr6*i
    return [dhdt, didt, dvdt]
print(hiv([1e6, 0, 100], 0))
x0 = [1e6, 0, 100] #initial conditions
t = np.linspace(0,15,1000) #time in years
x = odeint(hiv, x0, t) #vector of the functions H(t), I(t), V(t)
h = x[:,0]
i = x[:,1]
v = x[:,2]
plt.semilogy(t,h)
plt.show()
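For example, would something like np.gradient work here? A rough sketch using the h and t arrays from above (np.gradient uses second-order central differences, so there is no explicit loop):
dhdt_num = np.gradient(h, t)  # numerical dH/dt from the sampled solution
plt.semilogy(t, np.abs(dhdt_num))  # abs() since semilogy needs positive values
plt.show()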
I am trying to fit the two equations below using Python's leastsq method, but I am not sure whether this is the right approach. The first equation contains an incomplete gamma function, while the second one is slightly more complex: along with an exponential, it contains a term g(w, T) that is obtained from a separate fitting formula.
J_mg = T_incomplete(hw/T_mag)
J_nmg = e^(-hw/T) * g(w, T)
Here g is a function of w and T and is calculated using a given fitting formula.
I am following the steps outlined in this question.
Here is what I have done
import numpy as np
from scipy.optimize import leastsq
from scipy.special import gammaincc
from scipy.special import gamma
from matplotlib.pyplot import plot
# generating data
NPTS = 10
hw = np.linspace(0.5, 10, NPTS)
j1 = np.linspace(0.001,10,NPTS)
j2 = np.linspace(0.003,10,NPTS)
T_mag = np.linspace(0.3,0.5,NPTS)
#defining functions
def calc_gaunt_factor(hw, T):
    fitting_coeff = np.loadtxt('fitting_coeff.txt', skiprows=1)
    # T is in keV
    # K_b = 8.6173303(50)e-5 eV/K
    g = 0
    gamma = 0.0136/T
    theta = hw/T
    A = (np.log10(gamma**2) + 0.5)*0.4
    B = (np.log10(theta) + 1.5)*0.4
    for i in range(11):
        for j in range(11):
            g_ij = fitting_coeff[i][j]*(A**i)*(B**j)
            g = g_ij + g
    return g

def j_w_mag(hw, T_mag):
    order = 0.001
    return np.sqrt(1/T_mag)*gamma(order)*gammaincc(order, hw/T_mag)

def j_w_nonmag(hw, T):
    gamma = 0.0136/T
    theta = hw/T
    return np.sqrt(1/T)*np.exp((-hw)/T)*calc_gaunt_factor(hw, T)

def residual_func(T, T_mag, hw, j1, j2):
    err_unmag = np.nan_to_num(j1 - j_w_nonmag(hw, T))
    err_mag = np.nan_to_num(j2 - j_w_mag(hw, T_mag))
    err = np.concatenate((err_unmag, err_mag))
    return err
par_init = np.array([.35])
best, cov, info, message, ler = leastsq(residual_func,par_init,args=(T_mag,hw,j1,j2),full_output=True)
print("Best-Fit Parameters:")
print("T=%s" %(best[0]))
I am getting a weird value for my fitting parameter T. Is this the right approach? Thanks.
I have a varimax rotation code from wikipedia
def varimax(Phi, gamma = 1, q = 20, tol = 1e-6):
    from numpy import eye, asarray, dot, sum, diag
    from numpy.linalg import svd
    p, k = Phi.shape
    R = eye(k)
    d = 0
    for i in xrange(q):
        d_old = d
        Lambda = dot(Phi, R)
        u, s, vh = svd(dot(Phi.T, asarray(Lambda)**3 - (gamma/p) * dot(Lambda, diag(diag(dot(Lambda.T, Lambda))))))
        R = dot(u, vh)
        d = sum(s)
        if d/d_old < tol: break
    return dot(Phi, R)
and I use it this way:
varimax(X) ## X is a numpy array
but it returns numbers like 2.4243244e-15, which is not the answer I expect.
Should I change the other arguments, for example gamma or q?
I'm not familiar with varimax rotation.
Can you post an example of what you're using as the inputs for X and what kind of outputs you're expecting?
I tested your code after fixing up the indenting, like this:
from numpy import eye, asarray, dot, sum, diag
from numpy.linalg import svd
def varimax(Phi, gamma = 1, q = 20, tol = 1e-6):
    p, k = Phi.shape
    R = eye(k)
    d = 0
    for i in xrange(q):
        d_old = d
        Lambda = dot(Phi, R)
        u, s, vh = svd(dot(Phi.T, asarray(Lambda)**3 - (gamma/p) * dot(Lambda, diag(diag(dot(Lambda.T, Lambda))))))
        R = dot(u, vh)
        d = sum(s)
        if d/d_old < tol: break
    return dot(Phi, R)
And making some dummy components to test it like this:
import numpy as np
comps = np.linalg.svd(
    np.random.randn(100, 10),
    full_matrices=False
)[0]
rot_comps = varimax(comps)
print("Original components dimension {}".format(comps.shape))
print("Component norms")
print(np.sum(comps**2, axis=0))
print("Rotated components dimension {}".format(rot_comps.shape))
print("Rotated component norms")
print(np.sum(rot_comps**2, axis=0))
The inputs and outputs are 100 x 10 arrays with unit norm, just as you'd expect.
I'm running the minimization below:
from scipy.optimize import minimize
import numpy as np
import math
import matplotlib.pyplot as plt
### objective function ###
def Rlzd_Vol1(w1, S):
    L = len(S) - 1
    m = len(S[0])
    # Compute log returns, size (L, m)
    LR = np.array([np.diff(np.log(S[:,j])) for j in xrange(m)]).T
    # Compute weighted returns
    w = np.array([w1, 1.0 - w1])
    R = np.array([np.sum(w*LR[i,:]) for i in xrange(L)])  # size L
    # Compute realized vol.
    vol = np.std(R) * math.sqrt(260)
    return vol
# stock prices
S = np.exp(np.random.normal(size=(50,2)))
### optimization ###
obj_fun = lambda w1: Rlzd_Vol1(w1, S)
w1_0 = 0.1
res = minimize(obj_fun, w1_0)
print res
### Plot objective function ###
fig_obj = plt.figure()
ax_obj = fig_obj.add_subplot(111)
n = 100
w1 = np.linspace(0.0, 1.0, n)
y_obj = np.zeros(n)
for i in xrange(n):
    y_obj[i] = obj_fun(w1[i])
ax_obj.plot(w1, y_obj)
plt.show()
The objective function shows an obvious minimum (it's quadratic), but the minimization output tells me the minimum is at 0.1, the initial point.
I cannot figure out what's going wrong. Any thoughts?
w1 is passed in as a (single-entry) array, not as a scalar, by the minimize routine. Try what happens if you define w1 = np.array([0.2]) and then calculate w = np.array([w1, 1.0 - w1]): you'll see you get a 2x1 array instead of a 2-entry vector.
To make your objective function able to handle w1 being an array, you can simply put an explicit conversion to float, w1 = float(w1), as the first line of Rlzd_Vol1. Doing so, I obtain the correct minimum.
Note that you might want to use scipy.optimize.minimize_scalar instead, especially if you can bracket where your minimum will be.
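A short sketch of both fixes, assuming the Rlzd_Vol1 and S defined in the question:
from scipy.optimize import minimize, minimize_scalar
# Fix 1: coerce the length-1 array that minimize passes in to a scalar
obj_fun = lambda w1: Rlzd_Vol1(float(w1), S)
res = minimize(obj_fun, 0.1)
# Fix 2: use the scalar minimizer with bounds on w1
res_scalar = minimize_scalar(lambda w1: Rlzd_Vol1(w1, S), bounds=(0.0, 1.0), method='bounded')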
I have another question that I was hoping someone could help me with.
I'm using the Jensen-Shannon divergence to measure the similarity between two probability distributions. The similarity scores appear to be correct in the sense that they fall between 0 and 1, given that one uses the base-2 logarithm, with 0 meaning that the distributions are equal.
However, I'm not sure whether there is in fact an error somewhere and was wondering whether someone might be able to say 'yes it's correct' or 'no, you did something wrong'.
Here is the code:
from numpy import zeros, array
from math import sqrt, log
class JSD(object):
    def __init__(self):
        self.log2 = log(2)

    def KL_divergence(self, p, q):
        """ Compute KL divergence of two vectors, K(p || q)."""
        return sum(p[x] * log((p[x]) / (q[x])) for x in range(len(p)) if p[x] != 0.0 or p[x] != 0)

    def Jensen_Shannon_divergence(self, p, q):
        """ Returns the Jensen-Shannon divergence. """
        self.JSD = 0.0
        weight = 0.5
        average = zeros(len(p))  # Average
        for x in range(len(p)):
            average[x] = weight * p[x] + (1 - weight) * q[x]
        self.JSD = (weight * self.KL_divergence(array(p), average)) + ((1 - weight) * self.KL_divergence(array(q), average))
        return 1 - (self.JSD / sqrt(2 * self.log2))

if __name__ == '__main__':
    J = JSD()
    p = [1.0/10, 9.0/10, 0]
    q = [0, 1.0/10, 9.0/10]
    print J.Jensen_Shannon_divergence(p, q)
The problem is that I feel that the scores are not high enough when comparing two text documents, for instance. However, this is purely a subjective feeling.
Any help is, as always, appreciated.
Note that the scipy entropy call below is the Kullback-Leibler divergence.
See: http://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
#!/usr/bin/env python
from scipy.stats import entropy
from numpy.linalg import norm
import numpy as np
def JSD(P, Q):
    _P = P / norm(P, ord=1)
    _Q = Q / norm(Q, ord=1)
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))
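For example, with the distributions from the question (scipy.stats.entropy uses the natural logarithm by default; pass base=2 to it if you want the base-2 convention):
P = np.array([1.0/10, 9.0/10, 0])
Q = np.array([0, 1.0/10, 9.0/10])
print(JSD(P, Q))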
Also double-check that the distributions in the question's test case each sum to 1.0.
See: http://www.itl.nist.gov/div898/handbook/eda/section3/eda361.htm
Since the Jensen-Shannon distance (distance.jensenshannon) has been included in Scipy 1.2, the Jensen-Shannon divergence can be obtained as the square of the Jensen-Shannon distance:
from scipy.spatial import distance
distance.jensenshannon([1.0/10, 9.0/10, 0], [0, 1.0/10, 9.0/10]) ** 2
# 0.5306056938642212
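If you want the base-2 convention from the question instead (so the divergence is bounded by 1), I believe the base argument covers it:
distance.jensenshannon([1.0/10, 9.0/10, 0], [0, 1.0/10, 9.0/10], base=2) ** 2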
Get some data for distributions with known divergence and compare your results against those known values.
BTW: the sum in KL_divergence may be rewritten using the zip built-in function like this:
sum(_p * log(_p / _q) for _p, _q in zip(p, q) if _p != 0)
This does away with lots of "noise" and is also much more "pythonic". The double comparison with 0.0 and 0 is not necessary.
A general version, for n probability distributions, in Python:
import numpy as np
from scipy.stats import entropy as H
def JSD(prob_distributions, weights, logbase=2):
    # left term: entropy of mixture
    wprobs = weights[:, np.newaxis] * prob_distributions  # weight each distribution (row-wise)
    mixture = wprobs.sum(axis=0)
    entropy_of_mixture = H(mixture, base=logbase)

    # right term: sum of entropies
    entropies = np.array([H(P_i, base=logbase) for P_i in prob_distributions])
    wentropies = weights * entropies
    sum_of_entropies = wentropies.sum()

    divergence = entropy_of_mixture - sum_of_entropies
    return divergence
# From the original example with three distributions:
P_1 = np.array([1/2, 1/2, 0])
P_2 = np.array([0, 1/10, 9/10])
P_3 = np.array([1/3, 1/3, 1/3])
prob_distributions = np.array([P_1, P_2, P_3])
n = len(prob_distributions)
weights = np.empty(n)
weights.fill(1/n)
print(JSD(prob_distributions, weights))
#0.546621319446
Explicitly following the math in the Wikipedia article:
import numpy as np

def jsdiv(P, Q):
    """Compute the Jensen-Shannon divergence between two probability distributions.

    Input
    -----
    P, Q : array-like
        Probability distributions of equal length that sum to 1
    """

    def _kldiv(A, B):
        return np.sum([v for v in A * np.log2(A / B) if not np.isnan(v)])

    P = np.array(P)
    Q = np.array(Q)

    M = 0.5 * (P + Q)

    return 0.5 * (_kldiv(P, M) + _kldiv(Q, M))
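For example, with the distributions from the question (np.log2 means the result is in bits; the nan check is what skips the zero entries):
print(jsdiv([1.0/10, 9.0/10, 0], [0, 1.0/10, 9.0/10]))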