I'm writing a script that uses GPR to analyze and predict burn properties of different fuels. I've got good outputs for my test set, and now want to add a 95% confidence interval. When I try to implement the interval I get terrible results. Please send help.
#Gaussian Predictions for Ignition Delay
#September 14 2021
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
# Fit a Gaussian-process regressor and plot its predictions with a shaded band.
#gpr = GaussianProcessRegressor()
# Kernel: constant (amplitude) factor times an RBF factor; each tuple is the
# bounds argument passed alongside the initial value.
kernel = C(1.0, (1e-3, 1e3))*RBF(10, (1e-2, 1e2))
gpr = GaussianProcessRegressor(kernel = kernel, n_restarts_optimizer = 9, alpha = 0.1, normalize_y = True)
# x_train / y_train / x_test are not defined in this snippet; they come from elsewhere.
gpr.fit(x_train, y_train)
# return_std=True makes predict return a (mean, standard deviation) pair.
y_prediction, std = gpr.predict(x_test, return_std = True)
# NOTE(review): dividing by sqrt(len(x_test)) is the standard-error-of-the-mean
# formula; it shrinks the per-point std by the size of the test set. Compare
# with using 1.96 * std directly for a per-point 95% band.
confidence = std*1.96/np.sqrt(len(x_test))
# Reshape to a column vector.
# NOTE(review): if y_prediction is 1-D, subtracting an (n, 1) array broadcasts
# to (n, n) -- confirm the shapes match.
confidence = confidence.reshape(-1,1)
# Plot the observations, the prediction and the intended 95% confidence band.
plt.figure()
plt.plot(x_train, y_train, "b.", markersize=10, label="Observations")
# NOTE(review): plt.fill draws filled polygons from (x, y) sequences; a band
# between two curves is normally drawn with plt.fill_between -- confirm.
plt.fill(x_test,
y_prediction-confidence,
y_prediction+confidence,
alpha=0.3,
fc="b",
ec="None",
label="95% confidence interval",
) # intended to shade the confidence interval around the predictions
plt.plot(x_test, y_prediction, "r.", markersize=10, label="Prediction")
[enter image description here][1]
[1]: https://i.stack.imgur.com/PItpi.png
Looking at this example from the sklearn docs
https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_noisy_targets.html#sphx-glr-auto-examples-gaussian-process-plot-gpr-noisy-targets-py
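it looks like you need to adapt your plot function: use `plt.fill_between`, and build the band directly from the predictive standard deviation returned by `predict` (`y_prediction ± 1.96 * std`) rather than dividing it by `sqrt(len(x_test))`. For me, the following worked: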
# Shade the region within 1.96 predictive standard deviations of the mean.
band_half_width = 1.96 * std
lower_bound = y_prediction - band_half_width
upper_bound = y_prediction + band_half_width
plt.fill_between(
    x_test.ravel(),  # fill_between is given a flat 1-D x array
    lower_bound,
    upper_bound,
    alpha=0.5,
    label=r"95% confidence interval",
)
Here, I generated the data as in the sklearn example:
# Synthetic 1-D regression problem: 1000 evenly spaced inputs on [0, 10]
# (as a column vector) with noise-free targets y = x * sin(x).
X = np.linspace(start=0, stop=10, num=1_000).reshape(-1, 1)
y = np.squeeze(X * np.sin(X))

# Fixed seed so the same six training points are drawn on every run.
rng = np.random.RandomState(1)
training_indices = rng.choice(np.arange(y.size), size=6, replace=False)

# All remaining indices, in ascending order. np.setdiff1d replaces a Python
# loop that scanned the training-index array once per candidate index.
test_indices = np.setdiff1d(np.arange(y.size), training_indices)

x_train, y_train = X[training_indices], y[training_indices]
x_test, y_test = X[test_indices], y[test_indices]
Related
This code generates a graph of the regression line but the y-intercept taken from the LR model does not match the y-intercept on the graph. What am I missing? The script prints the y-intercept, taken from the model, as 152 but the graph shows it to be less than 100.
# Adapted from https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#sphx-glr-auto-examples-linear-model-plot-ols-py
# Code source: Jaques Grobler
# License: BSD 3 clause
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
# Load the diabetes data and keep only feature column 2, as an (n, 1) matrix.
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)
diabetes_X = diabetes_X[:, np.newaxis, 2]

# Hold out the final 20 samples for testing; train on everything before them.
holdout = 20
diabetes_X_train, diabetes_X_test = diabetes_X[:-holdout], diabetes_X[-holdout:]
diabetes_y_train, diabetes_y_test = diabetes_y[:-holdout], diabetes_y[-holdout:]

# Fit ordinary least squares (fit returns the estimator itself) and predict
# on the held-out samples.
regr = linear_model.LinearRegression().fit(diabetes_X_train, diabetes_y_train)
diabetes_y_pred = regr.predict(diabetes_X_test)

# Report the fitted intercept, i.e. the model's prediction at x = 0.
print("y-intercept: \n", regr.intercept_)

# Held-out points in black, fitted line in blue.
plt.scatter(diabetes_X_test, diabetes_y_test, color="black")
plt.plot(diabetes_X_test, diabetes_y_pred, color="blue", linewidth=3)
plt.xlabel('x')
plt.ylabel('y')
plt.show()
Output of the script:
y-intercept:
152.91886182616167
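Your x-axis extends into negative values, so x = 0 lies in the middle of the graph rather than at its left edge. The intercept is the height of the line at x = 0, and read there it matches the printed value (about 152).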
https://www.kaggle.com/paree24/development-index
I am trying to plot my LinearRegression line, but it doesn't show up correctly. Any ideas?
Also, if I remove the `[:,0]` from:
plt.scatter(x_test[:,0], y_test)
I get an error: ValueError: x and y must be the same size
There is another question regarding the graph: since the Population column is in the billions, all the other figures (as shown in the picture below) end up close to or equal to 0. Can I fix this?
Also, right now I have 3 separate pictures showing my graphs at the beginning (because of the same problem: the Population column is too big).
# Three point-style plots of infant mortality, one per feature column.
# None of the calls passes an `ax` argument; the question reports that they
# show up as three separate pictures.
plot1 = dataset.plot(x= "GDP ($ per capita)", y='Infant mortality ', style='o')
plot2 = dataset.plot(x= "Literacy (%)", y='Infant mortality ', style='o')
plot3 = dataset.plot(x= "Population", y='Infant mortality ', style='o')
# Applied after the three plot calls, then everything is displayed.
plt.tight_layout()
plt.show()
Is there any way I can show these graphs in 1 picture instead of 3 different pictures?
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from scipy.stats import shapiro
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import normalize
from math import sqrt
from sklearn.model_selection import cross_val_score
import seaborn as sns
# Load the development-index CSV from an absolute local path.
dataset = pd.read_csv(r"C:\Users\coolh\Desktop\machine learning\lab1\Development.csv")
# Features: every column except the four dropped here. Target: infant mortality.
# (Column names with trailing spaces are used exactly as written.)
x = np.array(dataset.drop(columns= ["Area (sq. mi.)", "Pop. Density ", "Development Index", "Infant mortality "]))
y = np.array(dataset["Infant mortality "])
# Point-style plots of the target against three individual columns; no `ax`
# is passed to any of the calls.
plot1 = dataset.plot(x= "GDP ($ per capita)", y='Infant mortality ', style='o')
plot2 = dataset.plot(x= "Literacy (%)", y='Infant mortality ', style='o')
plot3 = dataset.plot(x= "Population", y='Infant mortality ', style='o')
plt.tight_layout()
plt.show()
# Shapiro test on the target; the two prints show the p-value and then the
# test statistic (the Russian labels read roughly "indicator" and "statistic").
stat, p = shapiro(y)
print(f"показатель {p}")
print(f"статистика {stat}")
# 75% / 25% train/test split with a fixed random seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)
# Fit a linear regression on all feature columns and print its parameters.
regressor = LinearRegression()
regressor.fit(x_train, y_train)
print(f"regressor.intercept_ {regressor.intercept_}")
print(f"regressor.coef_{regressor.coef_}")
# 5-fold cross-validation over the full (unsplit) data.
# NOTE(review): the message below says "accuracy"; for a regressor the default
# score is presumably R^2 -- confirm against the scikit-learn docs.
scores = cross_val_score(regressor, x, y, cv=5)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
# Predict on the held-out rows and bar-plot actual vs. predicted for the first 50.
y_pred = regressor.predict(x_test)
df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
df1 = df.head(50)
df1.plot(kind = 'bar')
plt.grid(which ='major', color='green')
# NOTE(review): minor grid lines may need minor ticks enabled to be visible -- confirm.
plt.grid(which='minor', color='red')
plt.show()
# Root-mean-squared error on the held-out rows.
print(sqrt(mean_squared_error(y_test, y_pred)))
# The scatter uses only the first feature column of x_test ...
plt.scatter(x_test[:,0], y_test)
# ... but the line plot is given the whole x_test array.
# NOTE(review): if x_test has several columns, this draws one line per column,
# connected in test-row order rather than sorted by x -- a likely reason the
# regression line does not look right; verify.
plt.plot(x_test, y_pred, color='green', linewidth=1)
plt.show()
enter image description here
The plot method accepts the ax keyword that lets you plot onto an existing figure. See the example below:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Demo frame: an "x" column 0..9 plus five columns y0..y4 of normal noise.
columns = {"x": range(10)}
columns.update({f"y{i}": np.random.normal(size=10) for i in range(5)})
df = pd.DataFrame(columns)

# Create one figure and hand its current axes to every plot call through the
# `ax` keyword, so all three lines are drawn on the same picture.
fig = plt.figure()
shared_axes = plt.gca()
for series_name in ("y0", "y1", "y2"):
    df.plot("x", series_name, ax=shared_axes)
plt.show()
Note: since you're new, a piece of advice that has served me well is to include a minimal working example (MWE). The code you posted is quite long and most of it is unrelated to the problem.
I have the following pandas dataframe covering more than 10k answers for 150 questions.
I am struggling to find a way to see the correlation between fields.
In particular I would like to understand how I can graphically show the correlation between Q015 and Q008, knowing that each respondent might have selected multiple answers (1,2,3).
So I am trying to figure out how to graphically display whether there is any correlation between Q015 and Q008 for each selected option of the survey.
Any ideas?
You can visualize the relationship between the two columns with a simple linear regression of Q008 on Q015.
necessary libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
Code
# One list per summary column; each fitted variable pair appends one entry
# to every list.
list_variables, list_COEF, list_MSE, list_RMSE, list_R2SCORE = ([] for _ in range(5))

# Random 70% / 30% train/test split. The double brackets keep both sides as
# single-column DataFrames (2-D), which is why coef_ is indexed [0, 0] below.
xtrain, xtest, ytrain, ytest = train_test_split(df[["Q015"]], df[["Q008"]], test_size=0.3)

# Ordinary least-squares regression of Q008 on Q015. The estimator is built
# once; lr_baseline is the value returned by fit.
lr = LinearRegression()
lr_baseline = lr.fit(xtrain, ytrain)
pred_baseline = lr_baseline.predict(xtest)

# Record the slope and the held-out error metrics (MSE is computed once and
# reused for the RMSE).
mse_baseline = mean_squared_error(ytest, pred_baseline)
list_variables.append("Q015 & Q008")
list_COEF.append(round(lr_baseline.coef_[0, 0], 4))
list_MSE.append(round(mse_baseline, 2))
list_RMSE.append(round(math.sqrt(mse_baseline), 2))
list_R2SCORE.append(round(r2_score(ytest, pred_baseline), 2))

# Plotting the graph: held-out points in blue, fitted line in red.
plt.figure(figsize=(12, 8))
ax = plt.gca()
plt.suptitle("Q015 & Q008", fontsize=24, y=0.96)
ax.plot(xtest, ytest, 'bo', markersize=5)
ax.plot(xtest, pred_baseline, color="red", linewidth=2)
ax.set_xlabel("Q015", size=14)
ax.set_ylabel("Q008", size=14)
plt.tight_layout()
plt.show()
You will get something like the following, where the Coef. column tells you the slope of the fitted line (how much Q008 changes per unit of Q015) and the R2 score tells you how well the line fits.
Another way is to look at the correlation matrix:
# Pairwise correlation of the two columns, rounded to two decimals.
pair_corr = df[["Q015", "Q008"]].corr()
df_corr = pd.DataFrame(pair_corr).round(2)

# Boolean mask that is True on and above the diagonal, so the heatmap only
# shows the cells strictly below it.
mask = np.triu(np.ones(df_corr.shape, dtype=bool))

plt.figure(figsize=(10, 8))
plt.title("Pearson correlation between features", size=20)
ax = sns.heatmap(df_corr, mask=mask, vmin=-1, cmap="mako_r")

# Tick-label styling, then render.
plt.xticks(rotation=25, size=14, horizontalalignment="right")
plt.yticks(rotation=0, size=14)
plt.tight_layout()
plt.show()
An example for numeric columns
# Example frame: 100 rows of random integers in [0, 15) for six question columns.
# `columns` takes a flat list of labels; wrapping that list in a second list
# builds a MultiIndex whose labels are 1-tuples such as ("Q015",).
df = pd.DataFrame(
    np.random.randint(0, 15, size=(100, 6)),
    columns=["Q01", "Q02", "Q03", "Q07", "Q015", "Q008"],
)
The following code results in an x axis that ranges from 8 to 18. The data for the x axis actually ranges from 1,000 to 50 million. I would expect a log scale to show (10,000), (100,000), (1,000,000) (10,000,000) etc.
How do i fix the x axis?
# Build the frame from the Transactions and Price data and drop duplicate rows.
# NOTE(review): `pandas` is not imported under that name in this snippet, and
# DataFrame's second positional parameter is `index` -- confirm that Price is
# meant to become a column rather than the index.
dataset = pandas.DataFrame(Transactions, Price)
dataset = dataset.drop_duplicates()
import numpy as np
import matplotlib.pyplot as plt
# Double brackets keep X and y as single-column DataFrames.
X=dataset[['Transactions']]
y=dataset[['Price']]
# Natural log of the transaction counts; this transformed column (not X) is
# what is fitted here and plotted in viz_polymonial.
log_X =np.log(X)
from sklearn.linear_model import LinearRegression
# Created but not used anywhere in this snippet.
lin_reg = LinearRegression()
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion of log_X, then a linear fit of Price on it.
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(log_X)
pol_reg = LinearRegression()
pol_reg.fit(X_poly, y)
def viz_polymonial():
    """Plot the observed prices and the fitted degree-2 curve.

    Reads the module-level ``log_X``, ``y``, ``poly_reg`` and ``pol_reg``.
    Returns ``None``; the only effect is showing the figure.

    The x values drawn are ``log_X`` (natural logs of the transaction
    counts) on a linear axis, so the ticks show log values rather than raw
    transaction counts, even though the label reads 'Transactions'.
    """
    plt.scatter(log_X, y, color='red')
    # poly_reg is already fitted, so only transform is needed here;
    # refitting it at prediction time is unnecessary.
    plt.plot(log_X, pol_reg.predict(poly_reg.transform(log_X)), color='blue')
    plt.title('Price Curve')
    plt.xlabel('Transactions')
    plt.ylabel('Price')
    plt.grid(linestyle='dotted')
    plt.show()


viz_polymonial()
Plot:
You plot the values of log_X with log-scale. It's double-logged. Plot just X with log scale, or np.exp(log_X).
No, you are not even using a log scale. Plot X with a log scale, i.e. plt.xscale("log"), rather than log_X on a linear scale.
I have this code that loads the digits dataset with load_digits and uses an SVM model to predict digits. But after fitting the model, its prediction on new values looks incorrect: the predicted target does not correspond to the input image that is displayed. Below is the code:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import svm
# Load the digits dataset and create the SVM classifier.
digits = datasets.load_digits()
my_OCR_model = svm.SVC(gamma = 0.001, C = 100)
# Train on everything except the last 10 samples.
X, y = digits.data[:-10], digits.target[:-10]
my_OCR_model.fit(X, y)
# Predict and print the label for row -6 of the *sliced* X, then its true label.
print(my_OCR_model.predict(X[[-6]]))
print(y[-6])
# NOTE: digits.images is not sliced, so index -6 here counts from the end of
# the full dataset and refers to a different sample than X[-6] / y[-6].
plt.imshow(digits.images[-6], cmap=plt.cm.gray_r, interpolation="nearest")
plt.show()
Remove the slicing.
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import svm
# Use the complete dataset so that index -6 of X, y and digits.images all
# refer to the same sample.
digits = datasets.load_digits()
X, y = digits.data, digits.target # remove slicing here

my_OCR_model = svm.SVC(gamma=0.001, C=100)
my_OCR_model.fit(X, y)

# Predicted label, true label, then the matching image.
sample = -6
predicted = my_OCR_model.predict(X[[sample]])
print(predicted)
print(y[sample])
plt.imshow(digits.images[sample], cmap=plt.cm.gray_r, interpolation="nearest")
plt.show()
Alternatively, if you had a good reason for slicing, keep the same data for X, y, and images. Use this as the last line:
# Apply the same [:-10] slice to the images before indexing, so that -6 picks
# the same sample as X[-6] / y[-6] in the sliced training data.
plt.imshow(digits.images[:-10][-6], cmap=plt.cm.gray_r, interpolation="nearest")
plt.show()