How to extend the regression line in plot? - python

I did a cubic regression on the data below. How can I plot the regression line with x starting from 0 rather than from the minimum x?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
df = pd.DataFrame({'x':list(range(3,18)),'y':[-4,-2,0,3,5,8,12,17,21,23,24,25,26,26,24]})
x = df['x'].values.reshape(-1,1)
y = df['y'].values.reshape(-1,1)
cubic = PolynomialFeatures(degree=3)
x_cubic = cubic.fit_transform(x)  # fit_transform already fits the feature expander
model = LinearRegression()
model.fit(x_cubic, y)
fig, ax = plt.subplots()
ax.scatter(x, y, color = 'blue')
pred = model.predict(cubic.fit_transform(x))
ax.plot(x, pred, color = 'red')
ax.set_xlim(0)
ax.set_ylim(-20)
This is what I have now. How can I get a plot where the fitted curve extends back to x = 0?

Try creating an extended x range like this and predicting with your existing model. Add this at the bottom of your code.
ex_x = np.arange(0, 4).reshape(-1, 1)  # x from 0 up to the first data point (x = 3)
ex_pred = model.predict(cubic.fit_transform(ex_x))
ax.plot(ex_x, ex_pred, color='red', linestyle='--')
Output:
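If you would rather draw one smooth curve over the whole range instead of two segments, a minimal sketch (reusing x, model, and cubic from above) is to predict on a dense grid:
x_grid = np.linspace(0, x.max(), 200).reshape(-1, 1)  # dense grid from 0 to the largest x
ax.plot(x_grid, model.predict(cubic.transform(x_grid)), color='red')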

Related

Graphically show correlation between columns of a pandas dataframe

I have the following pandas dataframe covering more than 10k answers for 150 questions.
I am struggling to find a way to see the correlation between fields.
In particular I would like to understand how I can graphically show the correlation between Q015 and Q008, knowing that each respondent might have selected multiple answers (1,2,3).
So I am trying to figure out how to graphically display whether there is any correlation between Q015 and Q008 for each selected option of the survey.
Any ideas?
You can look at a linear regression between the two columns (Pearson).
Necessary libraries:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
Code
list_variables, list_COEF, list_MSE, list_RMSE, list_R2SCORE = ([] for i in range(5))
# initializing Linear Regression (Pearson)
lr = LinearRegression()
xtrain, xtest, ytrain, ytest = train_test_split(df[["Q015"]], df[["Q008"]], test_size=0.3)
lr_baseline = lr.fit(xtrain, ytrain)
pred_baseline = lr_baseline.predict(xtest)
list_variables.append("Q015 & Q008")
list_COEF.append(round(lr_baseline.coef_[0,0], 4))
list_MSE.append(round(mean_squared_error(ytest, pred_baseline), 2))
list_RMSE.append(round(math.sqrt(mean_squared_error(ytest, pred_baseline)), 2))
list_R2SCORE.append(round(r2_score(ytest, pred_baseline), 2))
# Plotting the graph
plt.figure(figsize=(12,8))
ax = plt.gca()
plt.suptitle("Q015 & Q008", fontsize=24, y=0.96)
plt.plot(xtest, ytest, 'bo', markersize = 5)
plt.plot(xtest, pred_baseline, color="red", linewidth = 2)
plt.xlabel("Q015", size=14)
plt.ylabel("Q008", size=14)
plt.tight_layout()
plt.show()
You will get something like the following, where the Coef. column tells you how strongly the variables are correlated.
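The table itself is not reproduced here; as a minimal sketch, you could assemble it from the lists above (df_results is just an illustrative name):
df_results = pd.DataFrame({
    "Variables": list_variables,
    "Coef.": list_COEF,
    "MSE": list_MSE,
    "RMSE": list_RMSE,
    "R2 Score": list_R2SCORE,
})
print(df_results)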
Another way is to look at the correlation matrix:
df_corr = df[["Q015", "Q008"]].corr().round(2)  # .corr() already returns a DataFrame
mask = np.zeros_like(df_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(10,8))
plt.title("Pearson correlation between features", size=20)
ax = sns.heatmap(df_corr, mask=mask, vmin=-1, cmap="mako_r")
plt.xticks(rotation=25, size=14, horizontalalignment="right")
plt.yticks(rotation=0, size=14)
plt.tight_layout()
plt.show()
An example with numeric columns:
df = pd.DataFrame(np.random.randint(0, 15, size=(100, 6)), columns=["Q01", "Q02", "Q03", "Q07", "Q015", "Q008"])

Polynomial Regression Curves in Python

I'm trying to fit a degree-2 regression curve to my data. When I create my graph, I get a funny zigzag:
but I want to model my data as an actual curve, which would look like the connected version of the scatter plot.
Any advice/better ways of doing this?
degree = 2
p = np.poly1d(np.polyfit(data['input'],y, degree))
plt.plot(data['input'], p(data['input']), c='r',linestyle='-')
plt.scatter(data['input'], p(data['input']), c='b')
Here, data['input'] is a column vector with the same dimensions as y.
Edit: I have also tried it like this:
X, y = np.array(data['input']).reshape(-1,1), np.array(data['output'])
lin_reg=LinearRegression(fit_intercept=False)
lin_reg.fit(X,y)
poly_reg=PolynomialFeatures(degree=2)
X_poly=poly_reg.fit_transform(X)
poly_reg.fit(X_poly,y)
lin_reg2=LinearRegression(fit_intercept=False)
lin_reg2.fit(X_poly,y)
X_grid=np.arange(min(X),max(X),0.1)
X_grid=X_grid.reshape((len(X_grid),1))
plt.scatter(X,y,color='red')
plt.plot(X,lin_reg2.predict(poly_reg.fit_transform(X)),color='blue')
plt.show()
Which gives me this graph here.
The scatter is my data and the blue zigzag is what is SUPPOSED to be a quadratic curve modelling the data. Help?
In your plot you are just drawing straight lines from point to point (where each y value is the approximation from your polyfit call), in whatever order the points appear.
I would skip the polyfit function (since you already have all the y values you are interested in) and instead interpolate data['input'] and y with a B-spline, using make_interp_spline from scipy, then plot the new y values over the x range you are interested in.
import numpy as np
import matplotlib.pyplot as plt
import scipy.interpolate as interp
# plots just from point to point (zigzag)
x = np.array([1, 2, 3, 4])
y = np.array([75, 0, 25, 100])
plt.plot(x, y)
# interpolates the points
x_new = np.linspace(1, 4, 300)
a_BSpline = interp.make_interp_spline(x, y)
y_new = a_BSpline(x_new)
plt.plot(x_new, y_new)
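If you would rather keep the polyfit approach, the zigzag comes from plotting the points in unsorted order; sorting the x values first also gives a smooth curve (a minimal sketch, assuming data['input'] and p from the question):
xs = np.sort(data['input'].values)  # plot in ascending x order
plt.plot(xs, p(xs), c='r', linestyle='-')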
Try this and then adjust it to your data! :)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
# X, y are assumed to be defined from your data, e.g.
# X = df[['Level']].values and y = df['Salary'].values
# improve: degree = 3
p_reg = PolynomialFeatures(degree=3)
X_poly = p_reg.fit_transform(X)
# again create a new linear regression object
reg2 = LinearRegression()
reg2.fit(X_poly, y)
plt.scatter(X, y, color = 'b')
plt.xlabel('Level')
plt.ylabel('Salary')
plt.title("Truth or Bluff")
# predicted values
plt.plot(X, reg2.predict(X_poly), color='r')
plt.show()
(Resulting plots with degree 3 and degree 4.)

Incorrect x axis on Matplotlib when doing polynomial linear regression

The following code results in an x axis that ranges from 8 to 18. The data for the x axis actually ranges from 1,000 to 50 million. I would expect a log scale to show 10,000, 100,000, 1,000,000, 10,000,000, etc.
How do I fix the x axis?
import numpy as np
import pandas
import matplotlib.pyplot as plt
dataset = pandas.DataFrame(Transactions, Price)  # Transactions and Price are defined elsewhere
dataset = dataset.drop_duplicates()
X=dataset[['Transactions']]
y=dataset[['Price']]
log_X =np.log(X)
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(log_X)
pol_reg = LinearRegression()
pol_reg.fit(X_poly, y)
def viz_polymonial():
    plt.scatter(log_X, y, color='red')
    plt.plot(log_X, pol_reg.predict(poly_reg.fit_transform(log_X)), color='blue')
    plt.title('Price Curve')
    plt.xlabel('Transactions')
    plt.ylabel('Price')
    plt.grid(linestyle='dotted')
    plt.show()
viz_polymonial()
Plot:
You plot the values of log_X with a log scale. It's double-logged. Plot just X with a log scale, or np.exp(log_X).
No, you are not even using a log scale here. Plot X with a log scale (plt.xscale("log")), not log_X on a linear scale.
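A minimal sketch of that fix, reusing X, y, log_X, pol_reg, and poly_reg from the question:
plt.scatter(X, y, color='red')
plt.plot(X, pol_reg.predict(poly_reg.fit_transform(log_X)), color='blue')  # assumes X is sorted; otherwise sort before drawing the line
plt.xscale("log")  # log-scaled x axis with ticks at 10,000, 100,000, 1,000,000, ...
plt.show()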

Is it possible to set the color for the bottom region with `mlxtend.plotting`?

I am trying to reproduce the example in this post, which produces this figure.
The colored regions above are plotted by mlxtend.plotting (version '0.14.0').
With the default settings on colab, this code
from mlxtend.plotting import plot_decision_regions
plot_decision_regions(X, y, clf=ppn)
produces this figure.
The data points have been plotted while the bottom region has not.
Is it possible to set the color for the bottom region with mlxtend.plotting?
It seems like a bug caused by having only two classes/regions; if you try to separate three classes, as in the following example, it will work.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.data import iris_data
from mlxtend.plotting import plot_decision_regions
# Initializing Classifiers
clf1 = LogisticRegression(random_state=0)
clf2 = RandomForestClassifier(random_state=0)
clf3 = SVC(random_state=0, probability=True)
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                              weights=[2, 1, 1], voting='soft')
# Loading some example data
X, y = iris_data()
X = X[:,[0, 2]]
# Plotting Decision Regions
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10, 8))
labels = ['Logistic Regression',
          'Random Forest',
          'RBF kernel SVM',
          'Ensemble']
for clf, lab, grd in zip([clf1, clf2, clf3, eclf],
                         labels,
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(lab)
plt.show()
Try asking directly on their GitHub repository: https://github.com/rasbt/mlxtend
I think it's possible: you can use the colors parameter instead, which is much easier. Try this; is this what you are looking for?
fig = plot_decision_regions(
    X=X,
    y=y.astype(int),
    clf=clf,
    legend=2,
    colors='yellow,red'
)
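For reference, the mlxtend documentation describes colors as a comma-separated string of colors, applied in the order of the class labels.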

extracting data from sns.kdeplot python

Is it possible to extract the data from a sns.kdeplot() before plotting?
i.e. without calling y.get_lines()[0].get_data() after plotting.
This can be done by extracting the line data from the matplotlib Axes object:
import numpy as np
import matplotlib.pyplot as plt
from seaborn import kdeplot
my_data = np.random.randn(1000)
my_kde = kdeplot(my_data)
line = my_kde.lines[0]
x, y = line.get_data()
fig, ax = plt.subplots()
ax.plot(x[x>0], y[x>0])
Alternatively, the statsmodels way:
import statsmodels.api as sm
dens = sm.nonparametric.KDEUnivariate(np.random.randn(1000))
dens.fit()
x = np.linspace(0, 1, 100)  # restrict range to (0, 1)
y = dens.evaluate(x)
plt.plot(x, y)
Based on the statsmodels documentation:
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
# generate a bimodal distribution
X1 = np.random.normal(100, 10, 250)
X2 = np.random.normal(10, 20, 250)
X = np.concatenate([X1, X2])
# get density from seaborn
x, y = sns.kdeplot(X).lines[0].get_data()
# get density from statsmodel
kde = sm.nonparametric.KDEUnivariate(X).fit()
xx, yy = (kde.support, kde.density)
# compare outputs
plt.plot(x, y, label='from sns')
plt.plot(xx, yy, label='from statsmodels')
plt.legend()
