exponential decay fitting - python

I am trying to fit some data that are distributed in time following an exponential decay. I tried to follow some fitting examples on the web, but my code doesn't fit the data: the fit only produces a straight line. Maybe there is something wrong with the initial parameters? Until now I have only used Gaussian and linear fits with the same method, which may not be appropriate for this case.
The code takes the data from the web, so it is directly executable.
Question: why doesn't the code result in any fit?
Many thanks in advance.
#!/usr/bin/env python
import pyfits, os, re, glob, sys
from scipy.optimize import leastsq
from numpy import *
from pylab import *
from scipy import *
rc('font',**{'family':'serif','serif':['Helvetica']})
rc('ps',usedistiller='xpdf')
rc('text', usetex=True)
#------------------------------------------------------
# tmin/tmax are only used for the x-axis limits of the plot at the end.
tmin = 56200
tmax = 56249
# Download the light curve and read columns 0, 1 and 2 of the first extension.
data=pyfits.open('http://heasarc.gsfc.nasa.gov/docs/swift/results/transients/weak/GX304-1.orbit.lc.fits')
# Column 0 is divided by 86400 (presumably seconds -> days) and shifted by the
# MJDREFF/MJDREFI header values, so the resulting times are of order 5.6e4.
time = data[1].data.field(0)/86400. + data[1].header['MJDREFF'] + data[1].header['MJDREFI']
rate = data[1].data.field(1)
error = data[1].data.field(2)
data.close()
# Keep only the samples inside the window 56210 < time < 56225.
cond = ((time > 56210) & (time < 56225))
time = time[cond]
rate = rate[cond]
error = error[cond]  # NOTE: filtered here but never passed to leastsq below
# Two-parameter model p[0]*exp(-p[1]*x) and its residual against the data.
right_exp = lambda p, x: p[0]*exp(-p[1]*x)
err = lambda p, x, y:(right_exp(p, x) -y)
# NOTE: v0 has three entries, but right_exp only reads p[0] and p[1].
# NOTE: with x > 56210 and p[1] = 56210.0, exp(-p[1]*x) underflows to 0, so the
# model starts out identically zero for every sample.
v0= [0.20, 56210.0, 1]
out = leastsq(err, v0[:], args = (time, rate), maxfev=100000, full_output=1)
v = out[0] #fit parameters out
# Evaluate the fitted model on a regular grid spanning the selected times.
xxx = arange(min(time), max(time), time[1] - time[0])
ccc = right_exp(v, xxx)
fig = figure(figsize = (9, 9)) #make a plot
ax1 = fig.add_subplot(111)
ax1.plot(time, rate, 'g.') #spectrum
ax1.plot(xxx, ccc, 'b-') #fitted spectrum
# NOTE: the figure is written to disk before axis() changes the limits, so the
# saved file does not include the limits set on the next line.
savefig("right exp.png")
axis([tmin-10, tmax, -0.00, 0.45])

Your problem is ill-conditioned because your time array contains large numbers; when they are used in exp(-a*time), the result is very close to 0. This tricks the err function: your rate array also contains small values close to 0, so the residuals are small no matter how poor the fit really is. In other words, a large a in the exponential function looks like a good solution to the optimizer.
To fix that you can:
change your decay function to include an initial time:
exp(-a*(time-time0))
change your input data to start from a smaller number:
time -= time.min()
For both options you have to change the initial guess v0, e.g. v0=[0.,0.]. The first solution seems more robust and you do not have to manage changes in your time array. A good initial guess for time0 is time.min():
right_exp = lambda p, x: p[0]*exp(-p[1]*(x-p[2]))
err = lambda p, x, y:(right_exp(p, x) -y)
v0= [0., 0., time.min() ]
out = leastsq(err, v0, args = (time, rate))
v = out[0] #fit parameters out
xxx = arange(min(time), max(time), time[1] - time[0])
ccc = right_exp(v, xxx)
fig = figure(figsize = (9, 9)) #make a plot
ax1 = fig.add_subplot(111)
ax1.plot(time, rate, 'g.') #spectrum
ax1.plot(xxx, ccc, 'b-') #fitted spectrum
fig.show()
Giving:
Still, the final results are depending on v0, e.g. with v0=[1.,1.,time.min()] it decays too fast and does not find the optimum.

Related

The result of least_squares is different depending on the environment

The same SciPy version, installed from the same channel, is used in both cases.
Nevertheless, the result of least_squares differs depending on the environment.
The only difference between the environments is that a different PC is used.
version:1.9.1 py39h316f440_0
channel:conda-forge
environment:windows
I've attached the source code I ran.
If all conditions are the same except for the environment, I would expect to get the same results.
What causes the difference, and how can I avoid it?
thank you.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy.integrate import odeint
from scipy.optimize import least_squares
import random
random.seed(134)
import numpy as np
np.random.seed(134)
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy.integrate import odeint
from scipy.optimize import least_squares
def report_params(fit_params_values, fit_param_names):
    """Print every fitted parameter as ``<name> is <value>``, one per line.

    Parameters
    ----------
    fit_params_values : sequence
        Parameter values, indexed in the same order as ``fit_param_names``.
    fit_param_names : sequence of str
        Names to print; one line is printed per name.

    Raises
    ------
    IndexError
        If ``fit_params_values`` has fewer entries than ``fit_param_names``.
    """
    for index, name in enumerate(fit_param_names):
        print(name, 'is', fit_params_values[index])
# define your modules
def pCon1():
    """Return the production rate of one specific constitutive promoter.

    This module takes no input; its output is a protein production amount
    per time unit.
    """
    pCon1_production_rate = 100
    return pCon1_production_rate
def pLux1(LuxR, AHL):
    """Return the production rate of one specific lux promoter.

    Parameters
    ----------
    LuxR : float or array_like
        Amount of LuxR.
    AHL : float or array_like
        Amount of AHL.

    Returns
    -------
    float or ndarray
        Protein production amount per time unit.

    Notes
    -----
    For every promoter there is some function that determines its basal and
    maximal expression from the amount of transcription factor floating
    around in the cell. The numbers below are fictional and arbitrary and are
    only filled in for demonstration purposes. The functions take the form of
    Hill functions.
    """
    # Basal expression as a function of LuxR.
    basal_n = 2
    basal_basal = 2
    basal_max = 2
    basal_kd = 2
    basal_expression_rate = basal_basal + (basal_max * (LuxR**basal_n / (LuxR**basal_n + basal_kd)))

    # Maximal expression as a function of LuxR.
    max_n = 2
    # NOTE(review): max_max is defined but never applied to
    # maximal_expression_rate, unlike basal_max above -- confirm the intent.
    max_max = 2
    max_kd = 2
    maximal_expression_rate = (LuxR**max_n / (LuxR**max_n + max_kd))

    # AHL modulates how much of the maximal expression is added to the basal one.
    pLux1_n = 2
    pLux1_kd = 10
    pLux1_production_rate = basal_expression_rate + maximal_expression_rate*(AHL**pLux1_n / (pLux1_kd + AHL**pLux1_n))
    return pLux1_production_rate
def simulation_set_of_equations(y, t, *args):
    """Right-hand side of the LuxR/GFP/AHL system, in the form odeint calls it.

    Parameters
    ----------
    y : sequence of three numbers
        Current amounts of LuxR, GFP and AHL, in that order.
    t : float
        Current time; it is not used by the equations.
    *args
        Parameters that are meant to be estimated later:
        ``(k_pCon_express, k_pLux_express, k_loss)``. Everything else is
        hard-coded below.

    Returns
    -------
    list
        ``[dLuxR, dGFP, dAHL]``, in the same order as the initial values.
    """
    # Unpack your parameters
    # NOTE(review): k_pCon_express is unpacked but does not appear in any
    # equation below (dLuxR uses pCon1() unscaled, whereas dGFP scales pLux1 by
    # k_pLux_express), so a fit cannot identify it -- confirm the intent.
    k_pCon_express = args[0]  # A summation of transcription and translation from a pCon promoter
    k_pLux_express = args[1]  # A summation of transcription and translation from a pLux promoter
    k_loss = args[2]  # A summation of dilution and degradation

    # Unpack your current amount of each species
    LuxR, GFP, AHL = y

    # Determine the change in each species
    dLuxR = pCon1() - k_loss*LuxR
    dGFP = pLux1(LuxR, AHL)*k_pLux_express - k_loss*GFP
    dAHL = 0  # for now we're assuming AHL was added exogenously and never degrades

    # Return the change in each species; make sure same order as your init values.
    # odeint integrates these derivatives to advance each species in time.
    return [dLuxR, dGFP, dAHL]
# Parameters
k_pCon_express = 101
k_pLux_express = 50
k_loss = 0.1
params = (k_pCon_express, k_pLux_express, k_loss)
param_names = ['k_pCon_express', 'k_pLux_express', 'k_loss'] # somehow this is honestly necessary in Python?!
# Initial Conditions
# LuxR, GFP, AHL
init_P = [1000, 0, 11]
# Timesteps
n_steps = 500
t = np.linspace(0, 30, n_steps)
num_P = odeint(simulation_set_of_equations, init_P, t, args = (params))
plt.plot(t, num_P[:,0], c='b', label = 'LuxR')
plt.plot(t, num_P[:,1], c='g', label = 'GFP')
plt.plot(t, num_P[:,2], c='r', label = 'AHL')
plt.xlabel('Time')
plt.ylabel('Concentration')
plt.legend(loc = 'best')
plt.grid()
plt.yscale('log')
plt.show()
noise = np.random.normal(0, 10, num_P.shape)
exp_P = num_P + noise
exp_t = t[::10]
exp_P = exp_P[::10]
# Create experimental data. Just take the regular simulation data and add some gaussian noise to it.
def residuals(params):
    """Return the flattened difference between simulated and experimental data.

    The system is integrated with ``odeint`` at the experimental time points
    ``exp_t`` from the initial state ``init_P`` using ``params``; ``exp_P`` is
    then subtracted and the result is flattened to 1-D, which is the form
    ``least_squares`` expects from its residual function.
    """
    params = tuple(params)  # odeint requires ``args`` to be a tuple
    sim_P = odeint(simulation_set_of_equations, init_P, exp_t, args = params)
    res = sim_P - exp_P
    return res.flatten()
initial_guess = (100, 100, 100)
low_bounds = [0, 0, 0]
up_bounds = [1000, 1000, 1000]
fitted_params = least_squares(residuals, initial_guess, bounds=(low_bounds, up_bounds)).x
# small reminder: .x is the fitted parameters attribute of the least_squares output
# With least_squares function, unlike, say, curve_fit, it does not compute the covariance matrix for you
# TODO calculate standard deviation of parameter estimation
# (will this ever be used other than sanity checking?)
print(params)
report_params(fitted_params, param_names)
(101, 50, 0.1)
k_pCon_express is 100.0
k_pLux_express is 49.9942246627
k_loss is 0.100037839987
plt.plot(t, odeint(simulation_set_of_equations, init_P, t, args = tuple(params))[:,1], c='r', label='GFP - Given Param Simulation')
plt.scatter(exp_t, exp_P[:,1], c='b', label='GFP - Fake Experimental Data')
plt.plot(t, odeint(simulation_set_of_equations, init_P, t, args = tuple(fitted_params))[:,1], c='g', label='GFP - Fitted Param Simlulation')
plt.legend(loc = 'best')
plt.xlabel('Time')
plt.ylabel('Concentration')
plt.grid()
plt.yscale('log')
plt.show()

Curve_Fit not accurate

I tried to fit strongly fluctuating data over time as well as possible. First I smoothed the data, which works fine. The smoothed data should then be represented by a fit in order to smooth out more of the peaks. As you can see in the code, I want to use a log-tanh function to fit the data. I am well aware that this problem has already occurred in other threads, but I have tried their suggestions, and the data is also neither very small nor very large, which I know can cause problems as well.
The polynomial fit I tried also works pretty well, as you can see, but it does not eliminate all the wavy values. They cause problems for the subsequent derivative, which turns out very badly.
import tkinter as tk
from tkinter import filedialog
import numpy as np
import scipy.signal
from scipy.optimize import curve_fit
from numpy import diff
import matplotlib.pyplot as plt
from lmfit.models import StepModel, LinearModel
def loghypfunc(x, A, B, C, D, E):
    """Log-tanh model: ``A*log(1 + x) + B*tanh(C*x) + D*x + E``.

    ``x`` is the independent variable (scalar or array); ``A`` to ``E`` are
    the fit parameters.
    """
    return A*np.log(1+x)+B*np.tanh(C*x)+D*x+E
def expfunc(t, c0, c1, c2, c3):
    """Linear-plus-exponential model: ``c0 + c1*t - c2*exp(-c3*t)``.

    ``t`` is the independent variable (scalar or array); ``c0`` to ``c3`` are
    the fit parameters.
    """
    return c0+c1*t-c2*np.exp(-c3*t)
def expdecay(x, a, b, c):
    """Exponential decay with offset: ``a*exp(-b*x) + c``.

    ``x`` is the independent variable (scalar or array); ``a``, ``b`` and
    ``c`` are the fit parameters.
    """
    return a * np.exp(-b * x) + c
path="C:/Users/Sammy/Documents/Masterarbeit WT/CSM und Kriechdaten/Kriechen/Creep_10mN_00008_LC_20210406_2121_DYN.txt"
dataFile = np.loadtxt(path, delimiter='\t', skiprows=2, usecols=(0, 1, 2, 3, 29, 30), dtype=float)
num_rows, num_cols = dataFile.shape
# time column
time = dataFile[:, [0]].transpose()
time = time.flatten()
refTime = time[0] # get first time in column (reference)
# genullte Testzeit
timeNull = time - refTime
print("time", time)
flatTimeNull = timeNull.flatten() # jetzt ein 1D array (one row)
##################################################################################
# indent displacement column
indentDis = dataFile[:, [4]].transpose()
indentDis = indentDis.flatten()
indentDis = indentDis - indentDis[0]
# the indendt data has to be smoothed so there is not such a big fluctuation
indentSmooth = scipy.signal.savgol_filter(indentDis, 2001, 3)
# null the indent Smooth data
indentSmooth_Null = indentSmooth - indentSmooth[0]
hind_Smooth_flat = indentSmooth_Null.flatten() # jetzt ein 1D array
print('indent smooth', indentSmooth)
######################################################################
p0 = [100, 0.1, 100, 0.1]
c, cov = curve_fit(expfunc, time, indentSmooth, p0)
# Evaluate the fitted model at the independent variable (time). Passing the
# y data (indentSmooth) as x here makes the plotted curve look like a bad fit
# even when curve_fit converged.
y_indent = expfunc(time, *c)
p0 = [70, 0.5, 50, 0.1, 100]
popt, pcov = curve_fit(loghypfunc, time, indentSmooth, p0, maxfev = 10000)
# Same here: evaluate at time, not at the fitted data.
y_indentTan = loghypfunc(time, *popt)
modelh_t = np.poly1d(np.polyfit(time, indentSmooth, 8))
plt.plot(time, indentSmooth, 'r', label="Data smoothed")
plt.scatter(time, modelh_t(time), s=0.1, label="Polyfit")
plt.plot(time, y_indentTan, label="Curve fit Tangens function")
plt.plot(time, y_indent, label="Curve fit exp function")
plt.legend(loc="lower right")
plt.xlabel("time")
plt.ylabel("indent")
plt.show()
These are the two arrays i get the data from
time [ 6.299596 6.349592 6.399589 ... 608.0109 608.060897 608.110894]
indent smooth [120.81411822 121.07093706 121.32748184 ... 476.78825661 476.89357473 476.99915287]
Here the plots
Plots
The question for me now is how to fix it. Is it because the optimized fit parameters are wrong? But Python should determine them automatically and sufficiently well, I guess?
My second guess was that the data is sampled too densely along the time axis, as the array is about 12000 values long. Could this be a reason?
I would be very grateful for any kind of advices regarding the fits.
Regards
Hndrx

Spline in 3D can not be differentiated due to an AttributeError

I am trying to fit a smoothing B-spline to some data and I found this very helpful post on here. However, I not only need the spline, but also its derivatives, so I tried to add the following code to the example:
tck_der = interpolate.splder(tck, n=1)
x_der, y_der, z_der = interpolate.splev(u_fine, tck_der)
For some reason this does not seem to work due to some data type issues. I get the following traceback:
Traceback (most recent call last):
File "interpolate_point_trace.py", line 31, in spline_example
tck_der = interpolate.splder(tck, n=1)
File "/home/user/anaconda3/lib/python3.7/site-packages/scipy/interpolate/fitpack.py", line 657, in splder
return _impl.splder(tck, n)
File "/home/user/anaconda3/lib/python3.7/site-packages/scipy/interpolate/_fitpack_impl.py", line 1206, in splder
sh = (slice(None),) + ((None,)*len(c.shape[1:]))
AttributeError: 'list' object has no attribute 'shape'
The reason for this seems to be that the second argument of the tck tuple contains a list of numpy arrays. I thought turning the input data to be a numpy array as well would help, but it does not change the data types of tck.
Does this behavior reflect an error in scipy, or is the input malformed?
I tried manually turning the list into an array:
tck[1] = np.array(tck[1])
but this (which didn't surprise me) also gave an error:
ValueError: operands could not be broadcast together with shapes (0,8) (7,1)
Any ideas of what the problem could be? I have used scipy before and on 1D splines the splder function works just fine, so I assume it has something to do with the spline being a line in 3D.
------- edit --------
Here is a minimum working example:
import numpy as np
import matplotlib.pyplot as plt
from scipy import interpolate
from mpl_toolkits.mplot3d import Axes3D
total_rad = 10
z_factor = 3
noise = 0.1
num_true_pts = 200
s_true = np.linspace(0, total_rad, num_true_pts)
x_true = np.cos(s_true)
y_true = np.sin(s_true)
z_true = s_true / z_factor
num_sample_pts = 80
s_sample = np.linspace(0, total_rad, num_sample_pts)
x_sample = np.cos(s_sample) + noise * np.random.randn(num_sample_pts)
y_sample = np.sin(s_sample) + noise * np.random.randn(num_sample_pts)
z_sample = s_sample / z_factor + noise * np.random.randn(num_sample_pts)
tck, u = interpolate.splprep([x_sample, y_sample, z_sample], s=2)
x_knots, y_knots, z_knots = interpolate.splev(tck[0], tck)
u_fine = np.linspace(0, 1, num_true_pts)
x_fine, y_fine, z_fine = interpolate.splev(u_fine, tck)
# this is the part of the code I inserted: the line under this causes the crash
tck_der = interpolate.splder(tck, n=1)
x_der, y_der, z_der = interpolate.splev(u_fine, tck_der)
# end of the inserted code
fig2 = plt.figure(2)
ax3d = fig2.add_subplot(111, projection='3d')
ax3d.plot(x_true, y_true, z_true, 'b')
ax3d.plot(x_sample, y_sample, z_sample, 'r*')
ax3d.plot(x_knots, y_knots, z_knots, 'go')
ax3d.plot(x_fine, y_fine, z_fine, 'g')
fig2.show()
plt.show()
Stumbled into the same problem...
I circumvented the error by not using interpolate.splder(tck, n=1); instead I used interpolate.splev(spline_ev, tck, der=1), which returns the derivatives at the points spline_ev (see the SciPy documentation).
If you need the spline I think you can then use interpolate.splprep() again.
In total something like:
import numpy as np
from scipy import interpolate
import matplotlib.pyplot as plt
points = np.random.rand(10,2) * 10
(tck, u), fp, ier, msg = interpolate.splprep(points.T, s=0, k=3, full_output=True)
spline_ev = np.linspace(0.0, 1.0, 100, endpoint=True)
spline_points = interpolate.splev(spline_ev, tck)
# Calculate derivative
# splev returns a list with one array per coordinate. That is already the
# layout splprep expects, and a list has no ``.T`` attribute anyway.
spline_der_points = interpolate.splev(spline_ev, tck, der=1)
spline_der = interpolate.splprep(spline_der_points, s=0, k=3, full_output=True)
# Plot the data and derivative
fig = plt.figure()
plt.plot(points[:,0], points[:,1], '.-', label="points")
plt.plot(spline_points[0], spline_points[1], '.-', label="tck")
plt.plot(spline_der_points[0], spline_der_points[1], '.-', label="tck_der")
# Show tangent
plt.arrow(spline_points[0][23]-spline_der_points[0][23], spline_points[1][23]-spline_der_points[1][23], 2.0*spline_der_points[0][23], 2.0*spline_der_points[1][23])
plt.legend()
plt.show()
EDIT:
I also opened an issue on GitHub, and according to ev-br the usage of interpolate.splprep is deprecated and one should use make_interp_spline / BSpline instead.
As noted in other answers, splprep output is incompatible with splder, but is compatible with splev. And the latter can evaluate the derivatives.
However, for interpolation, there is an alternative approach, which avoids splprep altogether. I'm basically copying a reply on the SciPy issue tracker (https://github.com/scipy/scipy/issues/10389):
Here's an example of replicating the splprep outputs. First let's make sense out of the splprep output:
# start with the OP example
import numpy as np
from scipy import interpolate
points = np.random.rand(10,2) * 10
(tck, u), fp, ier, msg = interpolate.splprep(points.T, s=0, k=3, full_output=True)
# check the meaning of the `u` array: evaluation of the spline at `u`
# gives back the original points (up to a list/transpose)
xy = interpolate.splev(u, tck)
xy = np.asarray(xy)
np.allclose(xy.T, points)
Next, let's replicate it without splprep. First, build the u array: the curve is represented parametrically, and u is essentially an approximation for the arc length. Other parametrizations are possible, but here let's stick to what splprep does. Translating the pseudocode from the doc page, https://docs.scipy.org/doc/scipy/reference/generated/scipy.interpolate.splprep.html
vv = np.sum((points[1:, :] - points[:-1, :])**2, axis=1)
vv = np.sqrt(vv).cumsum()
vv/= vv[-1]
vv = np.r_[0, vv]
# check:
np.allclose(u, vv)
Now, interpolate along the parametric curve: points vs vv:
spl = interpolate.make_interp_spline(vv, points)
# check spl.t vs knots from splPrep
spl.t - tck[0]
The result, spl, is a BSpline object which you can evaluate, differentiate etc in a usual way:
np.allclose(points, spl(vv))
# differentiate
# BSpline.derivative takes the derivative order (nu, default 1), not the
# evaluation points; it returns a new BSpline that is then evaluated at vv.
spl_derivative = spl.derivative()
derivative_at_points = spl_derivative(vv)

Fit the gamma distribution only to a subset of the samples

I have the histogram of my input data (in black) given in the following graph:
I'm trying to fit the Gamma distribution but not on the whole data but just to the first curve of the histogram (the first mode). The green plot in the previous graph corresponds to when I fitted the Gamma distribution on all the samples using the following python code which makes use of scipy.stats.gamma:
img = IO.read(input_file)
data = img.flatten() + abs(np.min(img)) + 1
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins, patches = plt.hist(data, 1000, normed=True)
# slice histogram here
# estimation of the parameters of the gamma distribution
fit_alpha, fit_loc, fit_beta = gamma.fit(data, floc=0)
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, fit_loc, fit_beta)
print '(alpha, beta): (%f, %f)' % (fit_alpha, fit_beta)
# plot estimated model
plt.plot(x, y, linewidth=2, color='g')
plt.show()
How can I restrict the fitting only to the interesting subset of this data?
Update1 (slicing):
I sliced the input data by keeping only values below the max of the previous histogram, but the results were not really convincing:
This was achieved by inserting the following code below the # slice histogram here comment in the previous code:
max_data = bins[np.argmax(n)]
data = data[data < max_data]
Update2 (scipy.optimize.minimize):
The code below shows how scipy.optimize.minimize() is used to minimize an energy function to find (alpha, beta):
import matplotlib.pyplot as plt
import numpy as np
from geotiff.io import IO
from scipy.stats import gamma
from scipy.optimize import minimize
def truncated_gamma(x, max_data, alpha, beta):
    """Density of a gamma distribution truncated to ``x < max_data``.

    Parameters
    ----------
    x : array_like
        Points at which to evaluate the density.
    max_data : float
        Truncation point; the density is 0 for ``x >= max_data``.
    alpha : float
        Shape parameter of the gamma distribution.
    beta : float
        Scale parameter of the gamma distribution (``loc`` is fixed at 0).

    Returns
    -------
    ndarray
        The gamma pdf divided by its cdf at ``max_data`` where
        ``x < max_data``, and 0 elsewhere.
    """
    gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
    # Renormalize by the probability mass left of the truncation point.
    norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
    return np.where(x < max_data, gammapdf / norm, 0)
# read image
img = IO.read(input_file)
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins = np.histogram(data, 100, normed=True)
# using minimize on a slice data below max of histogram
max_data = bins[np.argmax(n)]
data = data[data < max_data]
data = np.random.choice(data, 1000)
energy = lambda p: -np.sum(np.log(truncated_gamma(data, max_data, *p)))
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
# plot data histogram and model
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, 0, fit_beta)
plt.hist(data, 30, normed=True)
plt.plot(x, y, linewidth=2, color='g')
plt.show()
The algorithm above converged for a subset of data, and the output in o was:
x: array([ 16.66912781, 6.88105559])
But as can be seen on the screenshot below, the gamma plot doesn't fit the histogram:
You can use a general optimization tool such as scipy.optimize.minimize to fit a truncated version of the desired function, resulting in a nice fit:
First, the modified function:
def truncated_gamma(x, alpha, beta):
    """Density of a gamma distribution truncated at the global ``max_data``.

    ``alpha`` is the shape and ``beta`` the scale of the gamma distribution
    (``loc`` is fixed at 0). The truncation point is read from the
    module-level variable ``max_data``, which must be defined before calling.
    Returns the gamma pdf divided by its cdf at ``max_data`` where
    ``x < max_data``, and 0 elsewhere.
    """
    gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
    # Renormalize by the probability mass left of the truncation point.
    norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
    return np.where(x<max_data, gammapdf/norm, 0)
This selects values from the gamma distribution where x < max_data, and zero elsewhere. The np.where part is not actually important here, because the data is exclusively to the left of max_data anyway. The key is normalization, because varying alpha and beta will change the area to the left of the truncation point in the original gamma.
The rest is just optimization technicalities.
It's common practise to work with logarithms, so I used what's sometimes called "energy", or the logarithm of the inverse of the probability density.
energy = lambda p: -np.sum(np.log(truncated_gamma(data, *p)))
Minimize:
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
My output is (alpha, beta): (11.595208, 824.712481). Like the original, it is a maximum likelihood estimate.
If you're not happy with the convergence rate, you may want to
Select a sample from your rather big dataset:
data = np.random.choice(data, 10000)
Try different algorithms using the method keyword argument.
Some optimization routines output a representation of the inverse hessian, which is useful for uncertainty estimation. Enforcement of nonnegativity for the parameters may also be a good idea.
A log-scaled plot without truncation shows the entire distribution:
Here's another possible approach using a manually created dataset in excel that more or less matched the plot given.
Raw Data
Outline
Imported data into a Pandas dataframe.
Mask the indices after the
max response index.
Create a mirror image of the remaining data.
Append the mirror image while leaving a buffer of empty space.
Fit the desired distribution to the modified data. Below I do a normal fit by the method of moments and adjust the amplitude and width.
Working Script
# Import data to dataframe.
df = pd.read_csv('sample.csv', header=0, index_col=0)
# Mask indices after index at max Y.
# idxmax() returns the index *label* of the maximum, which is what the
# comparison against df.index.values needs; argmax() returns a position in
# current pandas.
mask = df.index.values <= df.Y.idxmax()
df = df.loc[mask, :]
scaled_y = 100*df.Y.values
# Create new df with mirror image of Y appended.
sep = 6
# The builtin float replaces the np.float alias, which was removed from NumPy.
app_zeroes = np.append(scaled_y, np.zeros(sep, dtype=float))
mir_y = np.flipud(scaled_y)
new_y = np.append(app_zeroes, mir_y)
# Using Scipy-cookbook to fit a normal by method of moments.
idxs = np.arange(new_y.size) # idxs=[0, 1, 2,...,len(data)]
mid_idxs = idxs.mean() # len(data)/2
# idxs-mid_idxs is [-53.5, -52.5, ..., 52.5, len(data)/2]
scaling_param = np.sqrt(np.abs(np.sum((idxs-mid_idxs)**2*new_y)/np.sum(new_y)))
# adjust amplitude
fmax = new_y.max()*1.2 # adjusted function max to 120% max y.
# adjust width
scaling_param = scaling_param*.7 # adjusted by 70%.
# Fit normal.
fit = lambda t: fmax*np.exp(-(t-mid_idxs)**2/(2*scaling_param**2))
# Plot results.
plt.plot(new_y, '.')
plt.plot(fit(idxs), '--')
plt.show()
Result
See the scipy-cookbook fitting data page for more on fitting a normal using method of moments.

Plot periodic trajectories

I have some data of a particle moving in a corridor with closed boundary conditions.
Plotting the trajectory leads to a zig-zag trajectory.
I would like to know how to prevent plot() from connecting the points where the particle comes back to the start. Some thing like in the upper part of the pic, but without "."
The first idea I had was to find the index where the numpy array a[:-1]-a[1:] becomes positive and then plot from 0 to that index. But how would I get the index of the first occurrence of a positive element of a[:-1]-a[1:]?
Maybe there are some other ideas.
I'd go a different approach. First, I'd determine the jump points not by looking at the sign of the derivative, as probably the movement might go up or down, or even have some periodicity in it. I'd look at those points with the biggest derivative.
Second, an elegant approach to have breaks in a plot line is to mask one value on each jump. Then matplotlib will make segments automatically. My code is:
import pylab as plt
import numpy as np
# The number of samples must be an integer; a float such as 1000. raises a
# TypeError in current NumPy.
xs = np.linspace(0., 100., 1000)
# Slowly rising signal with a small oscillation, wrapped into [0, 1).
data = (xs*0.03 + np.sin(xs) * 0.1) % 1
plt.subplot(2,1,1)
plt.plot(xs, data, "r-")
#Make a masked array with jump points masked
abs_d_data = np.abs(np.diff(data))
mask = np.hstack([ abs_d_data > abs_d_data.mean()+3*abs_d_data.std(), [False]])
masked_data = np.ma.MaskedArray(data, mask)
plt.subplot(2,1,2)
plt.plot(xs, masked_data, "b-")
plt.show()
And gives us as result:
The disadvantage of course is that you lose one point at each break - but with the sampling rate you seem to have I guess you can trade this in for simpler code.
To find where the particle has crossed the upper boundary, you can do something like this:
>>> import numpy as np
>>> a = np.linspace(0, 10, 50) % 5
>>> a = np.linspace(0, 10, 50) % 5 # some sample data
>>> np.nonzero(np.diff(a) < 0)[0] + 1
array([25, 49])
>>> a[24:27]
array([ 4.89795918, 0.10204082, 0.30612245])
>>> a[48:]
array([ 4.79591837, 0. ])
>>>
np.diff(a) calculates the discrete difference of a, while np.nonzero finds the indices where the condition np.diff(a) < 0 is true, i.e., where the particle has moved downward.
To avoid the connecting line you will have to plot by segments.
Here's a quick way to plot by segments when the derivative of a changes sign:
import numpy as np
a = np.linspace(0, 20, 50) % 5 # similar to Micheal's sample data
x = np.arange(50) # x scale
# Index of the first sample of every new segment (value dropped vs. previous).
indices = np.where(np.diff(a) < 0)[0] + 1 # the same as Micheal's np.nonzero
# Add the start and the end of the data so that the piece after the last jump
# is plotted as well.
bounds = [0] + list(indices) + [len(a)]
# plot is assumed to be in scope already, e.g. via ``from pylab import *``.
for start, stop in zip(bounds[:-1], bounds[1:]):
    plot(x[start:stop], a[start:stop], 'b-')
Based on Thorsten Kranz's answer, here is a version that adds points to the original data wherever 'y' crosses the period boundary. This is important if the density of data points isn't very high, e.g. np.linspace(0., 100., 100) vs. the original np.linspace(0., 100., 1000). The x positions of the curve transitions are linearly interpolated. Wrapped up in a function, it is:
import numpy as np
def periodic2plot(x, y, period=np.pi*2.):
    """Insert boundary points so a wrapped curve plots without jump lines.

    Wherever two consecutive ``y`` values differ by more than half a period,
    three points are inserted between them: one on the boundary the curve is
    leaving, one masked point that breaks the plotted line, and one on the
    opposite boundary. The ``x`` position of the inserted points is linearly
    interpolated to where the unwrapped curve crosses the boundary.

    Parameters
    ----------
    x, y : array_like
        Sample positions and values of the curve.
        NOTE(review): the inserted boundary values are 0 and ``period``, so
        ``y`` is presumably expected to lie in ``[0, period]`` -- confirm.
    period : float, optional
        Period of ``y``. Defaults to ``2*pi``.

    Returns
    -------
    x, y
        The inputs unchanged if no jump is found; otherwise masked arrays
        with three extra elements per jump.
    """
    indexes = np.argwhere(np.abs(np.diff(y)) > .5*period).flatten()
    index_shift = 0
    for i in indexes:
        # Earlier insertions moved the remaining jump positions to the right.
        i += index_shift
        index_shift += 3  # in every loop it adds 3 elements

        if y[i] > .5*period:
            # Leaving through the upper boundary and re-entering at 0.
            x_transit = np.interp(period, np.unwrap(y[i:i+2], period=period), x[i:i+2])
            add = np.ma.array([period, 0., 0.], mask=[0, 1, 0])
        else:
            # Leaving through the lower boundary and re-entering at period.
            # np.interp needs increasing xp, so both slices are reversed.
            x_transit = np.interp(0, np.unwrap(y[i:i+2], period=period)[::-1], x[i:i+2][::-1])
            add = np.ma.array([0., 0., period], mask=[0, 1, 0])

        # The middle (masked) element is what interrupts the plotted line.
        x_add = np.ma.array([x_transit]*3, mask=[0, 1, 0])
        x = np.ma.hstack((x[:i+1], x_add, x[i+1:]))
        y = np.ma.hstack((y[:i+1], add, y[i+1:]))
    return x, y
The code for comparison to the original answer of Thorsten Kranz with lower data-points density.
import matplotlib.pyplot as plt
x = np.linspace(0., 100., 100)
y = (x*0.03 + np.sin(x) * 0.1) % 1
#Thorsten Kranz: Make a masked array with jump points masked
abs_d_data = np.abs(np.diff(y))
mask = np.hstack([np.abs(np.diff(y))>.5, [False]])
masked_y = np.ma.MaskedArray(y, mask)
# Plot
plt.figure()
plt.plot(*periodic2plot(x, y, period=1), label='This answer')
plt.plot(x, masked_y, label='Thorsten Kranz')
plt.autoscale(enable=True, axis='both', tight=True)
plt.legend(loc=1)
plt.tight_layout()

Categories

Resources