How to count the number of points above a least-squares fit? - python

I want to count the points above the least squares fits.
from sklearn.mixture import GaussianMixture
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from astropy.io import ascii
from scipy.stats import norm
from astropy.timeseries import LombScargle
from astropy import stats as astropy_stats  # aliased so it does not shadow scipy's stats
# pd.read_csv already returns a DataFrame, so no extra DataFrame() wrapping is needed
df3 = pd.read_csv('Standard Dev main pop.csv')
df3.head()
df5 = pd.read_csv('2 Sigma Main pop.csv')
df5.head()
df6 = pd.read_csv('3 Sigma main pop.csv')
df6.head()
a = df5['Mean Mag']
b = df5['Std']
c = df6['Mean Mag']
d = df6['Std']
e = df3['Mean Mag']
f = df3['Std']
ax = plt.scatter(e, f, label='All sources')
#ay = plt.scatter(c, d, label='3 Sigma from Median Std')
# quadratic least-squares fit to the 2-sigma population
lstsq_coefs = np.polyfit(a, b, deg=2)
lstsq_preds = np.polyval(lstsq_coefs, a)
plt.plot(a, lstsq_preds, linestyle="dashed", color="red", label="Least squares 2 sigma")
# quadratic least-squares fit to the 3-sigma population
lstsq_coefs1 = np.polyfit(c, d, deg=2)
lstsq_preds1 = np.polyval(lstsq_coefs1, c)
plt.plot(c, lstsq_preds1, linestyle="dashed", color="black", label="Least squares 3 sigma")
plt.legend(loc='best', fontsize=16)
plt.gcf().set_size_inches((12, 10))
plt.ylim(0, 0.1)
plt.show()
I want to count the number of points that lie above each least-squares fit. I have tried some extremely tedious methods, which are not feasible in the long run.

You can compare them using the numpy module:
import numpy as np
f = np.array(f)
lstsq_preds = np.array(lstsq_preds)
lstsq_preds1 = np.array(lstsq_preds1)
print("Number above least squares #1:", len(f[f > lstsq_preds]))
print("Number above least squares #2:", len(f[f > lstsq_preds1]))
Note that I convert the arrays to numpy just to make sure they really are numpy arrays; these conversion lines may be unnecessary since you are dealing with pandas objects, which already support this kind of boolean indexing.
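This element-wise comparison assumes the y-values and the predictions line up one-to-one on the same x-grid. Since the scattered points (e, f) come from a different table than the fit inputs, a safer sketch (assuming you want each scatter point compared against the curve at its own Mean Mag value) is to evaluate the fitted polynomials at e with np.polyval:
import numpy as np
# evaluate each fitted quadratic at the x-values of the scattered points
preds_2sig_at_e = np.polyval(lstsq_coefs, e)   # 2-sigma fit evaluated at e
preds_3sig_at_e = np.polyval(lstsq_coefs1, e)  # 3-sigma fit evaluated at e
# a point is "above" a curve if its Std exceeds the curve's value there
print("Points above the 2-sigma fit:", int(np.sum(f > preds_2sig_at_e)))
print("Points above the 3-sigma fit:", int(np.sum(f > preds_3sig_at_e)))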

Related

Plotting a decaying exponential in PyCharm from a CSV file

I am trying to plot this data as a decaying exponential; all of the data has the same x values, just the y values differ. The model is y = a*(-1)*exp(-x/t).
I am not getting the correct chart when it runs. The linked CSV file and image show the type of curve I am looking for. I need to plot all of the data from the CSV (preferably on the same plot) in PyCharm. I am relatively new to PyCharm, so I am starting from scratch (Excel just wouldn't behave for this data). I am willing to start fresh as well if there is a simpler way of writing the code; I pieced this together with some help from the internet.
import scipy.signal as scp
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

def decaying_exponential(x, a, t, c):
    return a * (-1) * np.exp(-1 * x / t) + c

for f in os.listdir("/Users/flyar/My Python Stuff/"):
    print(f)

df = np.transpose(pd.read_csv("D:/Grad Lab/NMR/Data/T1 Data/mineral oil/F0009CH1.CSV", names=['a', 'b', 'c', 'd']).to_numpy())
temp = scp.find_peaks(df[2], height=0)
df_subset = [(df[1][n], df[2][n]) for n in temp[0]]
print(df_subset)
plt.scatter([df[2][n] for n in temp[0]], [df[1][n] for n in temp[0]])
y = np.linspace(min(df[2]), max(df[2]), 1000)
params, covs = curve_fit(decaying_exponential, [df[1][n] for n in temp[0][2:]],
                         [df[2][n] for n in temp[0][2:]], maxfev=10000)
print(params)
plt.plot(y, [decaying_exponential(l, 5, params[1], params[2]) for l in y])
plt.show()
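Two things stand out in this code: the final plot call passes a hard-coded 5 instead of the fitted amplitude params[0], and the plotting grid spans the y data rather than the x data. A minimal corrected sketch of the fit-and-plot step (assuming, as in the code above, x values in df[1] and y values in df[2] at the detected peaks):
# sketch: refit and plot using all three fitted parameters
xs = np.array([df[1][n] for n in temp[0][2:]])
ys = np.array([df[2][n] for n in temp[0][2:]])
params, covs = curve_fit(decaying_exponential, xs, ys, maxfev=10000)
grid = np.linspace(xs.min(), xs.max(), 1000)  # span the x data, not the y data
plt.scatter(xs, ys, label="peak data")
plt.plot(grid, decaying_exponential(grid, *params), label="fitted curve")
plt.legend()
plt.show()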

Problem with plotting/calculating exponential curve (python, matplotlib, pandas)

I have some data that forms an exponential curve, and I'm trying to fit that curve to the data.
Unfortunately, everything I have tried didn't work (I will spare you the madness of the code).
The thing is, it works when I use a*x**2 + b*x + c or a*x**3 + b*x**2 + c*x + d with what I found on the internet (implementations using from scipy.optimize import curve_fit). Again, I will spare you my iterations of the exp function.
Here is the data:
x,y
0.48995590396864286,8.109516054921031e-09
0.48995590396864286,8.09818090049968e-09
0.48995590396864286,8.103734197035667e-09
0.48995590396864286,8.110736963480639e-09
0.48995590396864286,8.09118823654877e-09
0.48995590396864286,8.12135991705394e-09
0.48995590396864286,8.122079043957364e-09
0.48995590396864286,8.128376050930522e-09
0.48995590396864286,8.157919899241163e-09
0.48661800486618,8.198100087712926e-09
0.48426150121065376,8.22138382076506e-09
0.48192771084337344,8.281557310731435e-09
0.4793863854266539,8.27420119872003e-09
0.47709923664122134,8.321514715516415e-09
0.47483380816714155,8.3552316463302e-09
0.47483380816714155,8.378564235036926e-09
0.47192071731949026,8.401917724613532e-09
0.4703668861712136,8.425994519752875e-09
0.4681647940074906,8.45965504646707e-09
0.4659832246039143,8.496218480906607e-09
0.46382189239332094,8.551849768778838e-09
0.46168051708217916,8.54285497435508e-09
0.46168051708217916,8.583748312156053e-09
0.46168051708217916,8.646661429014719e-09
0.4568296025582458,8.733501981255873e-09
0.45475216007276037,8.765708849715661e-09
0.45004500450045004,8.8589473576661e-09
0.44385264092321347,8.991513675928626e-09
0.4397537379067722,9.130861147033911e-09
0.43308791684711995,9.301055589581911e-09
0.4269854824935952,9.533957982742729e-09
0.42052144659377627,9.741467401775447e-09
0.41476565740356697,9.942960683024683e-09
0.4088307440719542,1.0205883938061429e-08
0.40176777822418647,1.0447121052453653e-08
0.3947887879984209,1.0747232046538825e-08
0.3895597974289053,1.1089181777589068e-08
0.3829950210647261,1.1466586145307001e-08
0.37664783427495296,1.1898726912256124e-08
0.3707823507601038,1.2248924384552248e-08
0.362844702467344,1.2806614625543388e-08
0.35676061362825545,1.3206507000963428e-08
0.35385704175513094,1.3625333143433576e-08
0.3460207612456747,1.4205592733074004e-08
0.34002040122407345,1.4793868231688043e-08
0.3348961821835231,1.545475512236522e-08
0.3287310979618672,1.6141630273450685e-08
0.32185387833923396,1.698004473312357e-08
0.3162555344718533,1.7677811603552503e-08
0.3111387678904792,1.858017339865837e-08
0.3037667071688943,1.9505998651376402e-08
0.29886431560071725,2.022694254385094e-08
0.2910360884749709,2.1353523243307723e-08
0.28457598178713717,2.2277591448622187e-08
0.2770083102493075,2.302804705798657e-08
0.2727024815925825,2.299784512552745e-08
If you believe this is an exponential curve, I would find a linear fit of the log of the data.
# your data in a DataFrame
import pandas as pd
import numpy as np
df = pd.read_csv("data.csv", sep=",")
# take the log of your data
log_y = np.log(df["y"])
# linear fit of the log (if y = exp(a*x + b), then ln(y) = a*x + b)
a, b = np.polyfit(df.x, log_y, 1)
# plot the fit
import matplotlib.pyplot as plt
plt.scatter(df.x, df.y, label="raw_data")
plt.plot(df.x, np.exp(a*df.x + b), label="fit")
plt.legend()
plt.show()
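If a direct curve_fit on an exponential model is still wanted, a frequent cause of failure is the default initial guess of all ones; seeding it from the log-linear fit above usually fixes it. A minimal sketch under that assumption (the model form a*exp(b*x) is illustrative, not from the question):
from scipy.optimize import curve_fit
import numpy as np

def exp_model(x, a, b):
    # simple exponential model, chosen for illustration
    return a * np.exp(b * x)

# seed curve_fit with the coefficients from the log-linear fit above:
# y = exp(a*x + b) = exp(b) * exp(a*x), so amplitude = exp(b), rate = a
p0 = [np.exp(b), a]
popt, pcov = curve_fit(exp_model, df.x, df.y, p0=p0, maxfev=10000)
print(popt)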

Fitting a quadratic function in python without numpy polyfit

I am trying to fit a quadratic function to some data, and I'm trying to do this without using numpy's polyfit function.
Mathematically I tried to follow this website https://neutrium.net/mathematics/least-squares-fitting-of-a-polynomial/ but somehow I don't think I'm doing it right. If anyone could assist me, that would be great, or if you could suggest another way to do it, that would also be awesome.
What I've tried so far:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

ones = np.ones(3)
A = np.array(((0, 1), (1, 1), (2, 1)))
xfeature = A.T[0]
squaredfeature = A.T[0] ** 2
b = np.array((1, 2, 0), ndmin=2).T
b = b.reshape(3)
features = np.concatenate((np.vstack(ones), np.vstack(xfeature), np.vstack(squaredfeature)), axis=1)
featuresc = features.copy()
print(features)
m_det = np.linalg.det(features)
print(m_det)
determinants = []
for i in range(3):
    featuresc.T[i] = b
    print(featuresc)
    det = np.linalg.det(featuresc)
    determinants.append(det)
    print(det)
    featuresc = features.copy()
determinants = determinants / m_det
print(determinants)
plt.scatter(A.T[0], b)
u = np.linspace(0, 3, 100)
plt.plot(u, u**2*determinants[2] + u*determinants[1] + determinants[0])
p2 = np.polyfit(A.T[0], b, 2)
plt.plot(u, np.polyval(p2, u), 'b--')
plt.show()
As you can see, my curve doesn't compare well to numpy's polyfit curve.
Update:
I went through my code and removed all the stupid mistakes, and now it works when I fit over 3 points, but I have no idea how to fit over more than three points.
This is the new code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

ones = np.ones(3)
A = np.array(((0, 1), (1, 1), (2, 1)))
xfeature = A.T[0]
squaredfeature = A.T[0] ** 2
b = np.array((1, 2, 0), ndmin=2).T
b = b.reshape(3)
features = np.concatenate((np.vstack(ones), np.vstack(xfeature), np.vstack(squaredfeature)), axis=1)
featuresc = features.copy()
print(features)
m_det = np.linalg.det(features)
print(m_det)
determinants = []
for i in range(3):
    featuresc.T[i] = b
    print(featuresc)
    det = np.linalg.det(featuresc)
    determinants.append(det)
    print(det)
    featuresc = features.copy()
determinants = determinants / m_det
print(determinants)
plt.scatter(A.T[0], b)
u = np.linspace(0, 3, 100)
plt.plot(u, u**2*determinants[2] + u*determinants[1] + determinants[0])
p2 = np.polyfit(A.T[0], b, 2)
plt.plot(u, np.polyval(p2, u), 'r--')
plt.show()
Instead of using Cramer's Rule, actually solve the system using least squares. Remember that Cramer's Rule only works if the total number of points you have equals the desired order of the polynomial plus 1.
If you don't have this, then Cramer's Rule will not work, as you're trying to find an exact solution to the problem. If you have more points, the method is unsuitable, as you will create an overdetermined system of equations.
To adapt this to more points, numpy.linalg.lstsq is a better fit, as it solves Ax = b by computing the vector x that minimizes the Euclidean norm of the residual. Therefore, remove the y values from the last column of the features matrix and use numpy.linalg.lstsq to solve for the coefficients:
import numpy as np
import matplotlib.pyplot as plt

ones = np.ones(4)
xfeature = np.asarray([0, 1, 2, 3])
squaredfeature = xfeature ** 2
b = np.asarray([1, 2, 0, 3])
features = np.concatenate((np.vstack(ones), np.vstack(xfeature), np.vstack(squaredfeature)), axis=1)  # Change - remove the y values
determinants = np.linalg.lstsq(features, b, rcond=None)[0]  # Change - use least squares
plt.scatter(xfeature, b)
u = np.linspace(0, 3, 100)
plt.plot(u, u**2*determinants[2] + u*determinants[1] + determinants[0])
plt.show()
I get this plot now, which matches the dashed curve in your graph and also matches what numpy.polyfit gives you.
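As a quick numeric check (a sketch, assuming the same toy data as above): numpy.linalg.lstsq here returns the coefficients constant-term first, while numpy.polyfit returns them highest-power first, so the two solutions should agree up to reversal:
p2 = np.polyfit(xfeature, b, 2)             # [a2, a1, a0], highest power first
print(np.allclose(determinants[::-1], p2))  # True: same least-squares solution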

How can I sample the different components of a GMM distribution?

I have clustered my data (12000, 3) using the sklearn Gaussian mixture model algorithm (GMM). I have 3 clusters. Each point of my data represents a molecular structure. I would like to know how I could sample each cluster. I have tried the function:
gmm = GMM(n_components=3).fit(Data)
gmm.sample(n_samples=20)
but it performs a sampling of the whole distribution; I need a sample from each one of the components.
Well, this is not that easy, since you need to calculate the eigenvectors of all the covariance matrices. Here is some example code for a problem I studied:
import numpy as np
from scipy.stats import multivariate_normal
import random
from operator import truediv
import itertools
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import mixture

# import some data which can be used for gmm
mix = np.loadtxt("mixture.txt", usecols=(0, 1), unpack=True)
#print(mix.shape)

color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 'darkorange'])

def plot_results(X, Y_, means, covariances, index, title):
    # function for plotting the gaussians
    splot = plt.subplot(2, 1, 1 + index)
    for i, (mean, covar, color) in enumerate(zip(means, covariances, color_iter)):
        v, w = linalg.eigh(covar)
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        u = w[0] / linalg.norm(w[0])
        # as the DP will not use every component it has access to
        # unless it needs it, we shouldn't plot the redundant
        # components.
        if not np.any(Y_ == i):
            continue
        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
        # Plot an ellipse to show the Gaussian component
        angle = np.arctan(u[1] / u[0])
        angle = 180. * angle / np.pi  # convert to degrees
        ell = mpl.patches.Ellipse(mean, v[0], v[1], angle=180. + angle, color=color)
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(0.5)
        splot.add_artist(ell)
    plt.xlim(-4., 3.)
    plt.ylim(-4., 2.)

gmm = mixture.GaussianMixture(n_components=3, covariance_type='full').fit(mix.T)
print(gmm.predict(mix.T))
plot_results(mix.T, gmm.predict(mix.T), gmm.means_, gmm.covariances_, 0, 'Gaussian Mixture')
So for my problem the resulting plot looked like this:
Edit: here is the answer to your comment. I would use pandas to do this. Assume X is your feature matrix and y are your labels; then:
import pandas as pd
y_pred = gmm.predict(X)
# wrap the predictions in a Series so pd.concat can align them with X and y
df_all_info = pd.concat([X, y, pd.Series(y_pred, name='y_pred')], axis=1)
In the resulting dataframe you can check all the information you want; you can even exclude just the samples the algorithm misclassified with:
df_wrong = df_all_info[df_all_info['name of y-column'] != df_all_info['name of y_pred column']]
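To sample from one specific fitted component rather than the whole mixture (a minimal sketch, assuming a fitted GaussianMixture called gmm with covariance_type='full'), you can draw directly from that component's Gaussian using its fitted mean and covariance:
import numpy as np

def sample_component(gmm, k, n_samples=20, seed=0):
    # draw n_samples points from component k of a fitted GaussianMixture
    rng = np.random.default_rng(seed)
    return rng.multivariate_normal(gmm.means_[k], gmm.covariances_[k], size=n_samples)

samples_cluster_0 = sample_component(gmm, 0)
print(samples_cluster_0.shape)  # (20, n_features)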

SKLearn ElasticNetCV: Looking for a similar cross-validation-error-plot to Matlab's lassoPlot or R's plot(cv.glmnet(x,y))

I use sklearn.linear_model.ElasticNetCV and I would like to get a figure similar to what Matlab provides with lassoPlot with plottype=CV, or R's plot(cv.glmnet(x,y)), i.e., a plot of the cross-validation errors over various alphas (note: in Matlab and R this parameter is called lambda). Here is an example:
import sklearn.linear_model as lm
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

# toy example
# generate 200 samples of five-dimensional artificial data X from
# exponential distributions with various means:
X = np.zeros((200, 5))
for col in range(5):
    X[:, col] = stats.expon.rvs(scale=1.0/(col+1), size=200)
# generate response data Y = X*r + eps where r has just two nonzero
# components, and the noise eps is normal with standard deviation 0.1:
r = np.array([0, 2, 0, -3, 0])
Y = np.dot(X, r) + np.random.randn(200)*0.1
enet = lm.ElasticNetCV()
alphas, coefs, _ = enet.path(X, Y)
# plot regularization paths
plt.plot(-np.log10(alphas), coefs.T, linestyle='-')
plt.show()
I would also like to plot, in a separate figure, the cross-validation error for each alpha. But it seems that ElasticNetCV.path() does not return an MSE vector. Is there similar functionality in sklearn to Matlab's lassoPlot with plottype='CV' (see: http://de.mathworks.com/help/stats/lasso-and-elastic-net.html) or R's cv.glmnet(x,y) (https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html)? Alternatively, I would implement it using sklearn.cross_validation. Do you have any suggestions?
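The CV errors are in fact available after fitting: ElasticNetCV stores the alpha grid in alphas_ and the per-fold mean squared errors in mse_path_, which is enough to reproduce the glmnet-style CV plot. A sketch continuing the toy example above (not from the original thread):
enet = lm.ElasticNetCV(cv=10)
enet.fit(X, Y)
# mse_path_ has shape (n_alphas, n_folds) for a scalar l1_ratio;
# average across folds to get one CV error per alpha
mean_mse = enet.mse_path_.mean(axis=-1)
plt.plot(-np.log10(enet.alphas_), mean_mse)
plt.axvline(-np.log10(enet.alpha_), linestyle='--', label='chosen alpha')
plt.xlabel('-log10(alpha)')
plt.ylabel('mean CV MSE')
plt.legend()
plt.show()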
