The result of least_squares is different depending on the environment - python

The same version of scipy, installed from the same channel, is used in both environments, yet the result of least_squares differs between them. The only difference is the machine the code runs on.
version: 1.9.1 py39h316f440_0
channel: conda-forge
environment: Windows
I've attached the source code I ran.
If the conditions are the same except for the machine, I would like to get the same results.
What causes the difference, and how can I make the results reproducible?
Thank you.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy.integrate import odeint
from scipy.optimize import least_squares
import random

# Seed both random number generators for reproducibility
random.seed(134)
np.random.seed(134)
def report_params(fit_params_values, fit_param_names):
    for each in range(len(fit_param_names)):
        print(fit_param_names[each], 'is', fit_params_values[each])
# define your modules
def pCon1():
    # This is the module for a specific instantiation of a constitutive promoter.
    # The input is nothing; the output is a protein production amount per time unit.
    pCon1_production_rate = 100
    return pCon1_production_rate
def pLux1(LuxR, AHL):
    # This is the module for a specific instantiation of a lux promoter.
    # The inputs are a LuxR amount and an AHL amount;
    # the output is a protein production amount per time unit.
    # For every promoter there is some function that determines the promoter's
    # maximal and basal expression based on the amount of transcription factor
    # floating around in the cell. These numbers are empirically determined, and
    # for demonstration purposes are fictionally and arbitrarily filled in here.
    # These functions take the form of Hill functions.
    basal_n = 2
    basal_basal = 2
    basal_max = 2
    basal_kd = 2
    basal_expression_rate = basal_basal + (basal_max * (LuxR**basal_n / (LuxR**basal_n + basal_kd)))
    max_n = 2
    max_max = 2
    max_kd = 2
    maximal_expression_rate = (LuxR**max_n / (LuxR**max_n + max_kd))
    pLux1_n = 2
    pLux1_kd = 10
    pLux1_production_rate = basal_expression_rate + maximal_expression_rate*(AHL**pLux1_n / (pLux1_kd + AHL**pLux1_n))
    return pLux1_production_rate
def simulation_set_of_equations(y, t, *args):
    # Args are strictly for parameters we want to eventually estimate.
    # Everything else must be hardcoded below. Sorry for the inconvenience.
    # Unpack your parameters
    k_pCon_express = args[0]  # a summation of transcription and translation from a pCon promoter
    k_pLux_express = args[1]  # a summation of transcription and translation from a pLux promoter
    k_loss = args[2]          # a summation of dilution and degradation
    # Unpack your current amount of each species
    LuxR, GFP, AHL = y
    # Determine the change in each species
    dLuxR = pCon1() - k_loss*LuxR
    dGFP = pLux1(LuxR, AHL)*k_pLux_express - k_loss*GFP
    dAHL = 0  # for now we're assuming AHL was added exogenously and never degrades
    # Return the change in each species, in the same order as your init values.
    # scipy's odeint will apply these to the current value of each species in the next time step for you.
    return [dLuxR, dGFP, dAHL]
# Parameters
k_pCon_express = 101
k_pLux_express = 50
k_loss = 0.1
params = (k_pCon_express, k_pLux_express, k_loss)
param_names = ['k_pCon_express', 'k_pLux_express', 'k_loss'] # somehow this is honestly necessary in Python?!
# Initial Conditions
# LuxR, GFP, AHL
init_P = [1000, 0, 11]
# Timesteps
n_steps = 500
t = np.linspace(0, 30, n_steps)
num_P = odeint(simulation_set_of_equations, init_P, t, args=params)  # params is already a tuple
plt.plot(t, num_P[:,0], c='b', label = 'LuxR')
plt.plot(t, num_P[:,1], c='g', label = 'GFP')
plt.plot(t, num_P[:,2], c='r', label = 'AHL')
plt.xlabel('Time')
plt.ylabel('Concentration')
plt.legend(loc = 'best')
plt.grid()
plt.yscale('log')
plt.show()
# Create experimental data: take the regular simulation data and add some Gaussian noise to it.
noise = np.random.normal(0, 10, num_P.shape)
exp_P = num_P + noise
exp_t = t[::10]
exp_P = exp_P[::10]
def residuals(params):
    params = tuple(params)
    sim_P = odeint(simulation_set_of_equations, init_P, exp_t, args=params)
    res = sim_P - exp_P
    return res.flatten()

initial_guess = (100, 100, 100)
low_bounds = [0, 0, 0]
up_bounds = [1000, 1000, 1000]
fitted_params = least_squares(residuals, initial_guess, bounds=(low_bounds, up_bounds)).x
# Small reminder: .x is the fitted-parameters attribute of the least_squares output.
# Unlike, say, curve_fit, least_squares does not compute the covariance matrix for you.
# TODO calculate standard deviation of parameter estimation
# (will this ever be used other than sanity checking?)
print(params)
report_params(fitted_params, param_names)
(101, 50, 0.1)
k_pCon_express is 100.0
k_pLux_express is 49.9942246627
k_loss is 0.100037839987
plt.plot(t, odeint(simulation_set_of_equations, init_P, t, args = tuple(params))[:,1], c='r', label='GFP - Given Param Simulation')
plt.scatter(exp_t, exp_P[:,1], c='b', label='GFP - Fake Experimental Data')
plt.plot(t, odeint(simulation_set_of_equations, init_P, t, args = tuple(fitted_params))[:,1], c='g', label='GFP - Fitted Param Simulation')
plt.legend(loc = 'best')
plt.xlabel('Time')
plt.ylabel('Concentration')
plt.grid()
plt.yscale('log')
plt.show()
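As an aside on the TODO in the script above: least_squares does expose the Jacobian at the solution, so a rough covariance estimate for the parameters can be derived from it. This is a minimal sketch under the usual Gauss-Newton approximation, not part of the original question:

# Hedged sketch: approximate parameter standard deviations from the fit.
result = least_squares(residuals, initial_guess, bounds=(low_bounds, up_bounds))  # same fit as above
J = result.jac                         # Jacobian at the solution
dof = len(result.fun) - len(result.x)  # residual count minus parameter count
s_sq = 2 * result.cost / dof           # cost is 0.5 * sum of squared residuals
cov = np.linalg.inv(J.T @ J) * s_sq    # Gauss-Newton covariance approximation
print(np.sqrt(np.diag(cov)))           # one standard deviation per parameter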

Related

Is it possible to specify different SPHI or SPLO limits for CVs for different parts of the horizon in GEKKO?

I would like to build a GEKKO model for dynamic schedule optimisation of a system. I am exploring GEKKO functionality with a toy problem (code attached below). I foresee that I will need to specify different targets for some CVs over different parts of the horizon, as depicted below.
I tried assigning an array to the SPLO parameter of the Level CV, but it simply collapsed SPHI and SPLO to the starting value of the CV.
I like the flexibility of using the objective function to drive the solution, as opposed to 'hard' constraints. Can this be accomplished in a non-iterative implementation, and if so, how?
from gekko import GEKKO
import numpy as np
import json
import pandas as pd
from matplotlib import pyplot as plt

def G1_offline(timespace=100):
    tk_lowlimit = [37]*100        #init low limit
    tk_lowlimit[40:70] = [38]*30  #increase low limit for a portion of the horizon
    m = GEKKO(remote=False)
    #tk_lowlimit_hard=m.Param(tk_lowlimit)
    rundown_schedule = [100]*timespace  #init rundown schedule
    rundown_schedule[40:45] = [95]*5    #adjust schedule for a few points
    m.time = np.linspace(0, timespace-1, timespace)
    m.Unit1_Feed = m.MV(value=25, lb=0, ub=60, name='Unit1 Feed')
    m.Unit2_Feed = m.MV(value=27, lb=0, ub=60, name='Unit2 Feed')
    m.Fuel = m.MV(value=10, lb=0, ub=100, name='Fuel')
    m.Rundown = m.MV(name='Rundown')  #this is a DV
    m.Efficiency = m.FV(value=0.99, lb=0.95, ub=1, name='Efficiency')
    m.Rundown.value = rundown_schedule
    m.Flare = m.SV(value=30, lb=0, ub=100, name='Flare')
    m.TankLevel = m.CV(value=25, lb=0, ub=300, name='tklevel')
    m.Consumers = m.MV(value=30, lb=0, ub=130, name='Consumers')
    m.Product = m.Intermediate((m.Unit1_Feed+m.Unit2_Feed)*m.Efficiency, name='Product')
    m.Balance = m.Intermediate(m.Product-m.Consumers, name='Balance')
    m.Equation(m.TankLevel.dt() == m.Balance)
    m.Equation(m.Flare == m.Rundown-(m.Unit1_Feed+m.Unit2_Feed+m.Fuel))
    #m.Equation(m.Flare>=1)

    #GLOBAL OPTIONS
    m.options.IMODE = 6        #control mode, dynamic control, simultaneous
    m.options.NODES = 2        #collocation nodes
    m.options.SOLVER = 1       #1=APOPT, 2=BPOPT, 3=IPOPT
    m.options.CV_TYPE = 1      #1 = dead-band, 2 = squared error from reference trajectory
    m.options.CTRL_UNITS = 3   #control time step units (3 = HOURS)
    m.options.CTRL_TIME = 1    #1 hour per time step
    m.options.REQCTRLMODE = 3  #3 = CONTROL
    #m.options.SCALING=2
    m.options.RTOL = 1e-6
    m.options.OTOL = 1e-6
    #m.options.CV_WGT_START=5
    m.options.CSV_WRITE = 2

    #MV/DV modes
    m.Unit1_Feed.STATUS = 1  #1 = can change
    m.Unit2_Feed.STATUS = 1  #1 = can change
    m.Fuel.STATUS = 1        #1 = can change
    m.Consumers.STATUS = 1   #1 = can change
    m.Rundown.STATUS = 0     #0 = cannot change, this is a DV
    m.Efficiency.STATUS = 0
    m.Efficiency.FSTATUS = 1

    #CV modes
    m.TankLevel.STATUS = 1   #1 = control this CV
    #m.Flare.STATUS=0        #0 = do not control this CV
    m.TankLevel.FSTATUS = 1  #allow feedback
    m.TankLevel.TAU = 12     #time constant for trajectory
    m.TankLevel.SPHI = 40    #upper limit for trajectory
    m.TankLevel.SPLO = 37    #lower limit for trajectory
    m.TankLevel.WSPLO = 20   #penalty for crossing LO limit
    m.TankLevel.WSPHI = 20   #penalty for crossing HI limit
    m.TankLevel.TR_INIT = 0  #0 = do not re-center
    m.TankLevel.TR_OPEN = 1  #opening shape of trajectory

    m.Consumers.COST = -40
    m.Unit1_Feed.COST = 5
    m.Unit2_Feed.COST = 4
    m.Fuel.COST = -2
    #m.Flare.COST=0
    m.Consumers.DCOST = 15
    m.Unit1_Feed.DCOST = 5
    m.Unit2_Feed.DCOST = 5
    m.Fuel.DCOST = 1
    m.Consumers.DMAX = 10
    m.Unit1_Feed.DMAX = 10
    m.Unit2_Feed.DMAX = 8
    m.Fuel.DMAX = 10
    m.Consumers.MV_STEP_HOR = 1
    m.Unit1_Feed.MV_STEP_HOR = 1
    m.Unit2_Feed.MV_STEP_HOR = 1
    m.Fuel.MV_STEP_HOR = 1

    m.solve(GUI=False)

    with open(m.path+'//results.json') as f:
        results = json.load(f)
    #print(results)
    results_df = pd.DataFrame(results)
    print(results_df)
    #results_df.to_excel(r'c:\data\toyproblem.xlsx')

    fig = plt.figure(figsize=(14,6))
    plt.plot(results_df['time'], results_df['tklevel'], color='red', label='Level')
    plt.fill_between(x=results_df['time'], y1=results_df['tklevel.tr_lo'],
                     y2=results_df['tklevel.tr_hi'], color='green', alpha=0.2,
                     label='Tklevel CV bounds')
    plt.xlabel('TIME')
    plt.title('Controlled solution')
    plt.ylabel('TankLevel')
    plt.legend(bbox_to_anchor=(0.0, 1), loc='upper left', borderaxespad=0.5)
    plt.minorticks_on()
    plt.grid(color='b', linestyle='--', linewidth=0.5, axis='y')
    plt.show()

    fig = plt.figure(figsize=(14,6))
    plt.plot(results_df['time'], results_df['unit1_feed'], color='red', label='Unit1')
    plt.plot(results_df['time'], results_df['unit2_feed'], color='green', label='Unit2')
    plt.plot(results_df['time'], results_df['consumers'], color='black', label='Consumers')
    plt.plot(results_df['time'], results_df['flare'], color='orange', label='Flare')
    plt.plot(results_df['time'], results_df['fuel'], color='blue', label='Fuel')
    plt.plot(results_df['time'], results_df['rundown'], color='purple', label='Rundown')
    plt.xlabel('TIME'), plt.ylabel('knm3/h'), plt.title('Independent variables')
    plt.legend(bbox_to_anchor=(0.0, 1), loc='upper left', borderaxespad=0.5)
    plt.minorticks_on()
    plt.grid(color='b', linestyle='--', linewidth=0.5, axis='y')
    trj_hi = results_df['tklevel.tr_hi']
    trj_lo = results_df['tklevel.tr_lo']
    return m, results_df

#----main----
c1, results_df = G1_offline(100)
It is possible to customize SPHI and SPLO instead of using a fixed target value. This is accomplished by redefining the CV as the difference between the current value and the target value. The target value can be a feedforward traj=m.Param(), with the values updated each cycle of the controller with something like traj.value = [custom_setpoint]. There is an example of this approach in the Dynamic Optimization course (see the bottom of the page).
# Error
e = m.CV(value=0,name='e')
m.Equation(e==v-traj)
# CV tuning
e.STATUS = 1 #add the CV to the objective
m.options.CV_TYPE = 1 #Dead-band
db = 2
e.SPHI = db #set point
e.SPLO = -db #set point
e.TR_INIT = 0 #dead-band
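Applied back to the tank-level question, a minimal sketch of a time-varying lower limit could look like the following. The names splo_profile, traj_lo, and e are illustrative, not from the original code; it would go inside G1_offline after m.TankLevel is defined, with m.TankLevel.STATUS set to 0 so that only the error CV is controlled:

# Hedged sketch: dead-band around a time-varying lower target
splo_profile = [37]*100        # base lower limit
splo_profile[40:70] = [38]*30  # raised limit for part of the horizon
traj_lo = m.Param(value=splo_profile)
e = m.CV(value=0, name='e')    # error between the level and the moving target
m.Equation(e == m.TankLevel - traj_lo)
e.STATUS = 1                   # control the error instead of the level
e.TR_INIT = 0
e.SPLO = 0                     # keep the level at or above the profile
e.SPHI = 3                     # allow up to 3 units above the profile
e.WSPLO = 20
e.WSPHI = 20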
Some applications require a custom reference trajectory that does not fit a standard form. A custom reference trajectory is specified by creating a new error (e) variable that is the difference between the specified trajectory (sinusoidal, sawtooth, random, etc) and the model output. This error is specified as a controlled variable (CV) with an upper and lower dead-band denoted as SPHI and SPLO. The CV can also be a value of zero with a squared error objective (e.SP=0, m.options.CV_TYPE=2) to drive to a target instead of a dead-band range.
import numpy as np
from random import random
from gekko import GEKKO
import matplotlib.pyplot as plt
# initialize GEKKO model
m = GEKKO()
# time
m.time = np.linspace(0,20,41)
# constants
mass = 500
# Parameters
b = m.Param(value=50)
K = m.Param(value=0.8)
# Manipulated variable
p = m.MV(value=0, lb=-100, ub=100)
# Reference trajectory
sine = 10*np.sin(m.time/20*4*np.pi)
traj = m.Param(value=sine)
# Controlled Variable
v = m.SV(value=0,name='v')
# Error
e = m.CV(value=0,name='e')
# Equations
m.Equation(mass*v.dt() == -v*b + K*b*p)
m.Equation(e==v-traj)
m.options.IMODE = 6 # control
# MV tuning
p.STATUS = 1 #allow optimizer to change
p.DCOST = 0.1 #smooth out MV
p.DMAX = 50 #slow down change of MV
# CV tuning
e.STATUS = 1 #add the CV to the objective
m.options.CV_TYPE = 1 #Dead-band
db = 2
e.SPHI = db #set point
e.SPLO = -db #set point
e.TR_INIT = 0 #dead-band
# Solve
m.solve()
# get additional solution information
import json
with open(m.path+'//results.json') as f:
    results = json.load(f)
# Plot solution
plt.figure()
plt.subplot(3,1,1)
plt.plot(m.time,p.value,'b-',lw=2,label='MV')
plt.legend(loc='best')
plt.ylabel('MV')
plt.subplot(3,1,2)
plt.plot(m.time,sine+db,'k-',label='SPHI')
plt.plot(m.time,sine-db,'k-',label='SPLO')
plt.plot(m.time,v.value,'r--',lw=2,label='CV')
plt.legend(loc='best')
plt.ylabel('CV')
plt.subplot(3,1,3)
plt.plot(m.time,results['e.tr_hi'],'k-',label='SPHI')
plt.plot(m.time,results['e.tr_lo'],'k-',label='SPLO')
plt.plot(m.time,e.value,'r--',lw=2,label='Error')
plt.legend(loc='best')
plt.ylabel('Error')
plt.xlabel('time')
plt.show()

How to increase predicted data when estimating parameters of set of complex differential equations by fitting experimental data?

I am trying to estimate the parameters of a set of ODEs in my program by minimizing the error between my experimental data and the predicted data.
The problem is that, while I can obtain a good prediction and a very good fit, the prediction is only evaluated at the same number of points as my experimental data, which makes for a very coarse-looking output.
Can you please give me more information on how I can obtain a denser, more accurate set of predicted points?
Code can be found below.
from gekko import GEKKO
import numpy as np
import matplotlib.pyplot as plt
xm = np.array([0,1,2,3,4,5])
ym = np.array([2.0,1.5,np.nan,2.2,3.0,5.0])
m = GEKKO(remote=False)
m.time = xm
a = m.FV(lb=0.1,ub=2.0)
a.STATUS=1
y = m.CV(value=ym,name='y',fixed_initial=False)
y.FSTATUS=1
m.Equation(y.dt()==a*y)
m.options.IMODE = 5
m.options.SOLVER = 1
m.solve(disp=True)
print('Optimized, a = ' + str(a.value[0]))
plt.figure(figsize=(6,2))
plt.plot(xm,ym,'bo',label='Meas')
plt.plot(xm,y.value,'r-',label='Pred')
plt.ylabel('y')
plt.ylim([0,6])
plt.legend()
plt.show()
If I replace m.time with the following to obtain more predicted points:
m.time = np.linspace(0,5,30)
I get the error:
Exception: Data arrays must have the same length, and match time discretization in dynamic problems
There are two options (Methods 1 and 2) that I've shown below. You can either plot the interpolating nodes to give you more resolution or create a new model for simulation.
from gekko import GEKKO
import numpy as np
import matplotlib.pyplot as plt
xm = np.array([0,1,2,3,4,5])
ym = np.array([2.0,1.5,np.nan,2.2,3.0,5.0])
m = GEKKO(remote=False)
m.time = xm
a = m.FV(lb=0.1,ub=2.0)
a.STATUS=1
y = m.CV(value=ym,name='y',fixed_initial=False)
y.FSTATUS=1
m.Equation(y.dt()==a*y)
m.options.IMODE = 5
m.options.SOLVER = 1
m.options.CSV_WRITE = 2 # For Method 1
m.options.NODES = 3 # For Method 1 (options 3-6)
m.solve(disp=True)
print('Optimized, a = ' + str(a.value[0]))
# Method 1: Plot interpolating nodes
import json
with open(m.path+'//results_all.json') as f:
    results = json.load(f)
# Method 2: Re-simulate with more points
sim = GEKKO(remote=False)
ap = a.value[0]
xp = np.linspace(0,7); sim.time=xp
yp = sim.Var(y.value[0])
sim.Equation(yp.dt()==ap*yp)
sim.options.NODES = 3
sim.options.IMODE=4; sim.solve()
plt.figure(figsize=(6,2))
plt.plot(xm,ym,'bo',label='Meas')
plt.plot(xm,y.value,'gs-.',label='Pred Original')
plt.plot(results['time'],results['y'],'kx-',
         markersize=10,label='Pred Method 1')
plt.plot(xp,yp,'r.--',label='Pred Method 2')
plt.ylabel('y')
plt.ylim([0,10])
plt.legend()
plt.show()
A third option is to reset the .value of the original model, but that can be tedious. Instead, you can also create both the estimation and simulation models in a loop, as is done in an example of Moving Horizon Estimation and Model Predictive Control that uses the same model but transfers parameters between them:
# use remote=True for MacOS
mhe = GEKKO(name='tclab-mhe', remote=False)
mpc = GEKKO(name='tclab-mpc', remote=False)

# create 2 models (MHE and MPC) in one loop
for m in [mhe, mpc]:
    # Parameters with bounds
    m.K1 = m.FV(value=0.607, lb=0.1, ub=1.0)
    m.K2 = m.FV(value=0.293, lb=0.1, ub=1.0)
    m.K3 = m.FV(value=0.24, lb=0.1, ub=1.0)
    m.tau12 = m.FV(value=192, lb=100, ub=200)
    m.tau3 = m.FV(value=15, lb=10, ub=20)
    m.Ta = m.Param(value=23.0)  # degC
    m.Q1 = m.MV(value=0, lb=0, ub=100, name='q1')
    m.Q2 = m.MV(value=0, lb=0, ub=100, name='q2')
    # Heater temperatures (T1m, T2m are measured temperature arrays from the TCLab example)
    m.TH1 = m.SV(value=T1m[0])
    m.TH2 = m.SV(value=T2m[0])
    # Sensor temperatures
    m.TC1 = m.CV(value=T1m[0], name='tc1')
    m.TC2 = m.CV(value=T2m[0], name='tc2')
    # Temperature difference between the two heaters
    m.DT = m.Intermediate(m.TH2-m.TH1)
    # Equations
    m.Equation(m.tau12*m.TH1.dt()+(m.TH1-m.Ta) == m.K1*m.Q1+m.K3*m.DT)
    m.Equation(m.tau12*m.TH2.dt()+(m.TH2-m.Ta) == m.K2*m.Q2-m.K3*m.DT)
    m.Equation(m.tau3*m.TC1.dt()+m.TC1 == m.TH1)
    m.Equation(m.tau3*m.TC2.dt()+m.TC2 == m.TH2)
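The parameter hand-off between the two models is not shown in the excerpt above; a hedged sketch of one controller cycle might look like this. It assumes the MHE parameters have STATUS=1 so the estimator updates them, and uses GEKKO's NEWVAL attribute, which holds an FV's post-solve value:

# Hedged sketch: estimate with the MHE, then transfer parameters to the MPC
for name in ['K1', 'K2', 'K3', 'tau12', 'tau3']:
    getattr(mhe, name).STATUS = 1  # let the estimator adjust this parameter
    getattr(mpc, name).STATUS = 0  # keep it fixed in the controller
mhe.solve(disp=False)
for name in ['K1', 'K2', 'K3', 'tau12', 'tau3']:
    getattr(mpc, name).value = getattr(mhe, name).NEWVAL
mpc.solve(disp=False)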

Filtering 1D numpy arrays in Python

Explanation:
I have two numpy arrays, dataX and dataY, and I am trying to filter each array to reduce the noise. The image shown below shows the actual input data (blue dots) and an example of what I want it to look like (red dots). I do not need the filtered data to be as perfect as in the example, but I do want it to be as straight as possible. I have provided sample data in the code.
What I have tried:
Firstly, you can see that the data isn't 'continuous', so I first divided it into individual 'segments' (4 of them in this example) and then applied a filter to each 'segment'. Someone suggested that I use a Savitzky-Golay filter. The full, runnable code is below:
import scipy as sc
import scipy.signal
import numpy as np
import matplotlib.pyplot as plt
# Sample Data
ydata = np.array([1,0,1,2,1,2,1,0,1,1,2,2,0,0,1,0,1,0,1,2,7,6,8,6,8,6,6,8,6,6,8,6,6,7,6,5,5,6,6, 10,11,12,13,12,11,10,10,11,10,12,11,10,10,10,10,12,12,10,10,17,16,15,17,16, 17,16,18,19,18,17,16,16,16,16,16,15,16])
xdata = np.array([1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32,33, 1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32])
# Used a diff array to find where there is a big change in Y.
# If there's a big change in Y, then there must be a change of 'segment'.
diffy = np.diff(ydata)
# Create empty numpy arrays to append values into
filteredX = np.array([])
filteredY = np.array([])
# Chose 3 to be the value indicating the change in Y
index = np.where(diffy >3)
# Loop through the array
start = 0
for i in range(0, index[0].size + 1):
    # Check if the last segment is reached
    if i == index[0].size:
        print(xdata[start:])
        partSize = xdata[start:].size
        # Window length must be an odd integer
        if partSize % 2 == 0:
            partSize = partSize - 1
        filteredDataX = sc.signal.savgol_filter(xdata[start:], partSize, 3)
        filteredDataY = sc.signal.savgol_filter(ydata[start:], partSize, 3)
        filteredX = np.append(filteredX, filteredDataX)
        filteredY = np.append(filteredY, filteredDataY)
    else:
        print(xdata[start:index[0][i]])
        partSize = xdata[start:index[0][i]].size
        if partSize % 2 == 0:
            partSize = partSize - 1
        filteredDataX = sc.signal.savgol_filter(xdata[start:index[0][i]], partSize, 3)
        filteredDataY = sc.signal.savgol_filter(ydata[start:index[0][i]], partSize, 3)
        start = index[0][i]
        filteredX = np.append(filteredX, filteredDataX)
        filteredY = np.append(filteredY, filteredDataY)
# Plots
plt.plot(xdata,ydata, 'bo', label = 'Input Data')
plt.plot(filteredX, filteredY, 'ro', label = 'Filtered Data')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Result')
plt.legend()
plt.show()
This is my result:
When each point is connected, the result looks as follows.
I have played around with the order, but it seems like a third order gave the best result.
I have also tried these filters, among a few others:
scipy.signal.medfilt
scipy.ndimage.filters.uniform_filter1d
But so far none of the filters I have tried were close to what I really wanted. What is the best way to filter data such as this? Looking forward to your help.
One way to get something looking close to your ideal would be clustering + linear regression.
Note that you have to provide the number of clusters, and I also cheated a bit by scaling up y before clustering.
import numpy as np
from scipy import cluster, stats
ydata = np.array([1,0,1,2,1,2,1,0,1,1,2,2,0,0,1,0,1,0,1,2,7,6,8,6,8,6,6,8,6,6,8,6,6,7,6,5,5,6,6, 10,11,12,13,12,11,10,10,11,10,12,11,10,10,10,10,12,12,10,10,17,16,15,17,16, 17,16,18,19,18,17,16,16,16,16,16,15,16])
xdata = np.array([1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32,33, 1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32])
def split_to_lines(x, y, k):
    yo = np.empty_like(y, dtype=float)
    # get the cluster centers and the labels for each point
    centers, map_ = cluster.vq.kmeans2(np.array((x, y * 2)).T.astype(float), k)
    # for each cluster, use the labels to select the points belonging to
    # the cluster and do a linear regression
    for i in range(k):
        slope, interc, *_ = stats.linregress(x[map_==i], y[map_==i])
        # use the regression parameters to construct y values on the
        # best fit line
        yo[map_==i] = x[map_==i] * slope + interc
    return yo
import pylab
pylab.plot(xdata, ydata, 'or')
pylab.plot(xdata, split_to_lines(xdata, ydata, 4), 'ob')
pylab.show()
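One caveat, not from the original answer: kmeans2 picks its initial centers at random, so repeated runs can label the clusters differently. Assuming a scipy version whose kmeans2 draws from NumPy's global random state, seeding before the call makes the demo repeatable:

np.random.seed(42)                      # illustrative seed
yfit = split_to_lines(xdata, ydata, 4)  # now reproducible across runs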

Fit the gamma distribution only to a subset of the samples

I have the histogram of my input data (in black) given in the following graph:
I'm trying to fit the Gamma distribution, not to the whole dataset, but only to the first peak of the histogram (the first mode). The green plot in the previous graph corresponds to fitting the Gamma distribution to all of the samples, using the following Python code, which makes use of scipy.stats.gamma:
img = IO.read(input_file)
data = img.flatten() + abs(np.min(img)) + 1
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins, patches = plt.hist(data, 1000, normed=True)
# slice histogram here
# estimation of the parameters of the gamma distribution
fit_alpha, fit_loc, fit_beta = gamma.fit(data, floc=0)
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, fit_loc, fit_beta)
print('(alpha, beta): (%f, %f)' % (fit_alpha, fit_beta))
# plot estimated model
plt.plot(x, y, linewidth=2, color='g')
plt.show()
How can I restrict the fitting only to the interesting subset of this data?
Update1 (slicing):
I sliced the input data by keeping only values below the peak of the previous histogram, but the results were not really convincing:
This was achieved by inserting the following code below the # slice histogram here comment in the previous code:
max_data = bins[np.argmax(n)]
data = data[data < max_data]
Update2 (scipy.optimize.minimize):
The code below shows how scipy.optimize.minimize() is used to minimize an energy function to find (alpha, beta):
import matplotlib.pyplot as plt
import numpy as np
from geotiff.io import IO
from scipy.stats import gamma
from scipy.optimize import minimize
def truncated_gamma(x, max_data, alpha, beta):
    gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
    norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
    return np.where(x < max_data, gammapdf / norm, 0)
# read image
img = IO.read(input_file)
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins = np.histogram(data, 100, normed=True)
# using minimize on a slice data below max of histogram
max_data = bins[np.argmax(n)]
data = data[data < max_data]
data = np.random.choice(data, 1000)
energy = lambda p: -np.sum(np.log(truncated_gamma(data, max_data, *p)))
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
# plot data histogram and model
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, 0, fit_beta)
plt.hist(data, 30, normed=True)
plt.plot(x, y, linewidth=2, color='g')
plt.show()
The algorithm above converged for a subset of data, and the output in o was:
x: array([ 16.66912781, 6.88105559])
But as can be seen on the screenshot below, the gamma plot doesn't fit the histogram:
You can use a general optimization tool such as scipy.optimize.minimize to fit a truncated version of the desired function, resulting in a nice fit:
First, the modified function:
def truncated_gamma(x, alpha, beta):
    gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
    norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
    return np.where(x < max_data, gammapdf / norm, 0)
This selects values from the gamma distribution where x < max_data, and zero elsewhere. The np.where part is not actually important here, because the data is exclusively to the left of max_data anyway. The key is normalization, because varying alpha and beta will change the area to the left of the truncation point in the original gamma.
The rest is just optimization technicalities.
It's common practice to work with logarithms, so I used what's sometimes called "energy", or the logarithm of the inverse of the probability density.
energy = lambda p: -np.sum(np.log(truncated_gamma(data, *p)))
Minimize:
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
My output is (alpha, beta): (11.595208, 824.712481). Like the original, it is a maximum likelihood estimate.
If you're not happy with the convergence rate, you may want to:
Select a sample from your rather big dataset:
data = np.random.choice(data, 10000)
Try different algorithms using the method keyword argument.
Some optimization routines output a representation of the inverse hessian, which is useful for uncertainty estimation. Enforcement of nonnegativity for the parameters may also be a good idea.
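As an illustration of that last point (not from the original answer): with an unbounded quasi-Newton method such as BFGS, the result object carries an inverse-Hessian approximation that can serve as a rough covariance estimate at the maximum-likelihood solution:

o = minimize(energy, initial_guess, method='BFGS')  # BFGS exposes hess_inv; no bounds, though
cov = o.hess_inv                # approximate covariance matrix of [alpha, beta]
stderr = np.sqrt(np.diag(cov))  # rough standard errors for the fitted parameters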
A log-scaled plot without truncation shows the entire distribution:
Here's another possible approach, using a manually created dataset in Excel that more or less matches the plot given.
Raw Data
Outline
Import the data into a Pandas dataframe.
Mask the indices after the max-response index.
Create a mirror image of the remaining data.
Append the mirror image while leaving a buffer of empty space.
Fit the desired distribution to the modified data. Below I do a normal fit by the method of moments and adjust the amplitude and width.
Working Script
# Imports assumed by this script
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import data to dataframe.
df = pd.read_csv('sample.csv', header=0, index_col=0)
# Mask indices after the index at max Y.
mask = df.index.values <= df.Y.argmax()
df = df.loc[mask, :]
scaled_y = 100*df.Y.values
# Create new y with a mirror image of Y appended, leaving a buffer of zeros.
sep = 6
app_zeroes = np.append(scaled_y, np.zeros(sep, dtype=float))
mir_y = np.flipud(scaled_y)
new_y = np.append(app_zeroes, mir_y)
# Using the Scipy-cookbook recipe to fit a normal by the method of moments.
idxs = np.arange(new_y.size)  # idxs = [0, 1, 2, ..., len(data)-1]
mid_idxs = idxs.mean()        # len(data)/2
# idxs - mid_idxs runs symmetrically about zero, e.g. [-53.5, -52.5, ..., 52.5, 53.5]
scaling_param = np.sqrt(np.abs(np.sum((idxs-mid_idxs)**2*new_y)/np.sum(new_y)))
# Adjust amplitude.
fmax = new_y.max()*1.2            # adjusted function max to 120% of max y
# Adjust width.
scaling_param = scaling_param*.7  # adjusted to 70%
# Fit normal.
fit = lambda t: fmax*np.exp(-(t-mid_idxs)**2/(2*scaling_param**2))
# Plot results.
plt.plot(new_y, '.')
plt.plot(fit(idxs), '--')
plt.show()
Result
See the scipy-cookbook fitting data page for more on fitting a normal using method of moments.

scipy curve fitting negative value

I would like to fit a curve with curve_fit and prevent it from becoming negative. Unfortunately, the code below does not work. Any hints? Thanks a lot!
# Imports
from scipy.optimize import curve_fit
import numpy as np
import matplotlib.pyplot as plt
xData = [0.0009824379203203417, 0.0011014182912933933, 0.0012433979929054324, 0.0014147106052612918, 0.0016240300315499524, 0.0018834904507916608, 0.002210485320720769, 0.002630660216394964, 0.0031830988618379067, 0.003929751681281367, 0.0049735919716217296, 0.0064961201261998095, 0.008841941282883075, 0.012732395447351627, 0.019894367886486918, 0.0353677651315323, 0.07957747154594767, 0.3183098861837907]
yData = [99.61973156923796, 91.79478510744039, 92.79302188621314, 84.32927272723863, 77.75060981602016, 75.62801782349504, 70.48026800610839, 72.21240551953743, 68.14019252499526, 55.23015406920851, 57.212682880377464, 50.777016257727176, 44.871140881319626, 40.544138806850846, 32.489105158795525, 25.65367127756607, 19.894206907130403, 13.057996247388862]
def func(x, m, c, d):
    '''
    Fitting Function
    I put d as an absolute number to prevent negative values for d?
    '''
    return x**m * c + abs(d)
p0 = [-1, 1, 1]
coeff, _ = curve_fit(func, xData, yData, p0) # Fit curve
m, c, d = coeff[0], coeff[1], coeff[2]
print("d: " + str(d)) # Why is it negative!!
Your model actually works fine, as the following plot shows. I used your code and plotted the original data and the data you obtain with the fitted parameters:
As you can see, the data can be reproduced nicely, but you do obtain a negative value for d (which need not be a bad thing, depending on the context of the model). If you want to avoid it, I recommend using lmfit, where you can constrain your parameters to certain ranges. The next plot shows the outcome.
As you can see, it also reproduces the data well, and you obtain a positive value for d, as desired.
namely:
m: -0.35199747
c: 8.48813181
d: 0.05775745
Here is the entire code that reproduces the figures:
# Imports
from scipy.optimize import curve_fit
import numpy as np
import matplotlib.pyplot as plt
#additional import
from lmfit import minimize, Parameters, Parameter, report_fit
xData = [0.0009824379203203417, 0.0011014182912933933, 0.0012433979929054324, 0.0014147106052612918, 0.0016240300315499524, 0.0018834904507916608, 0.002210485320720769, 0.002630660216394964, 0.0031830988618379067, 0.003929751681281367, 0.0049735919716217296, 0.0064961201261998095, 0.008841941282883075, 0.012732395447351627, 0.019894367886486918, 0.0353677651315323, 0.07957747154594767, 0.3183098861837907]
yData = [99.61973156923796, 91.79478510744039, 92.79302188621314, 84.32927272723863, 77.75060981602016, 75.62801782349504, 70.48026800610839, 72.21240551953743, 68.14019252499526, 55.23015406920851, 57.212682880377464, 50.777016257727176, 44.871140881319626, 40.544138806850846, 32.489105158795525, 25.65367127756607, 19.894206907130403, 13.057996247388862]
def func(x, m, c, d):
    '''
    Fitting Function
    I put d as an absolute number to prevent negative values for d?
    '''
    print(m, c, d)
    return np.power(x, m)*c + d
p0 = [-1, 1, 1]
coeff, _ = curve_fit(func, xData, yData, p0) # Fit curve
m, c, d = coeff[0], coeff[1], coeff[2]
print("d: " + str(d)) # Why is it negative!!
plt.scatter(xData, yData, s=30, marker = "v",label='P')
plt.scatter(xData, func(xData, *coeff), s=30, marker = "v",color="red",label='curvefit')
plt.show()
#####the new approach starts here
def func2(params, x, data):
    m = params['m'].value
    c = params['c'].value
    d = params['d'].value
    model = np.power(x, m)*c + d
    return model - data  # that's what you want to minimize
# create a set of Parameters
params = Parameters()
params.add('m', value= -2) #value is the initial condition
params.add('c', value= 8.)
params.add('d', value= 10.0, min=0) #min=0 prevents that d becomes negative
# do fit, here with leastsq model
result = minimize(func2, params, args=(xData, yData))
# calculate final result
final = yData + result.residual
# write error report
report_fit(params)
try:
    import pylab
    pylab.plot(xData, yData, 'k+')
    pylab.plot(xData, final, 'r')
    pylab.show()
except:
    pass
You could use the bounds option of scipy.optimize.curve_fit to specify the lower and upper bounds for the parameters:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.curve_fit.html
bounds is a two-tuple of the lower and upper bounds. In your case, you just need to specify the lower bound for d. You could use
bounds=([-np.inf, -np.inf, 0], np.inf)
Note: if you provide a scalar as one element (e.g. np.inf as the second element above), it automatically applies as that bound for all three coefficients.
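Putting it together, a small sketch of the call with this bound, reusing func, xData, yData, and p0 from the question:

coeff, _ = curve_fit(func, xData, yData, p0,
                     bounds=([-np.inf, -np.inf, 0], np.inf))
m, c, d = coeff
print('d: ' + str(d))  # d is now constrained to be nonnegative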
You just need to add one more argument to constrain your parameters:
curve_fit(func, xData, yData, p0, bounds=([m1,c1,d1],[m2,c2,d2]))
where m1, c1, d1 are the lower bounds of the parameters and m2, c2, d2 are the upper bounds. To force all of m, c, d to be positive, the call would be:
curve_fit(func, xData, yData, p0, bounds=(0, np.inf))
where every parameter gets a lower bound of 0 and an upper bound of infinity (no bound). One caution for this particular dataset: y decreases with x, so the exponent m has to be negative; bounding all three parameters at 0 would make the initial guess p0 = [-1, 1, 1] infeasible, and in practice only d needs the lower bound of 0.
