I am following some SVC code from a book that uses the moons dataset.
Here is the code:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVC  # StandardScaler and SVC were missing from the imports
X, y = make_moons(n_samples=100, noise=0.15)
rbf_kernel_svm_clf = Pipeline([
("scaler", StandardScaler()),
("svm_clf", SVC(kernel="rbf", gamma=5, C=0.001))
])
rbf_kernel_svm_clf.fit(X, y)
I have tried plotting these graphs with the following code, but nothing has worked so far.
plt.scatter(X, y)
Any help? Thanks.
You need something more than a plain scatter plot to show the decision regions (and plt.scatter(X, y) fails anyway because X has two columns; a scatter of the raw data would be plt.scatter(X[:, 0], X[:, 1], c=y)). A very useful module for this is MLxtend, which makes it easy to plot the decision regions of a fitted model with plot_decision_regions. Here's how you could get it done using your example:
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 8))
# pass the whole fitted pipeline so the input is scaled before prediction,
# exactly as it was during training
plot_decision_regions(X, y, clf=rbf_kernel_svm_clf, legend=2)
plt.show()
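If you would rather not add a dependency, the same picture can be drawn with plain matplotlib by evaluating the fitted pipeline on a mesh grid. A minimal sketch (the 0.5 margin and 0.02 step are arbitrary choices):
import numpy as np
import matplotlib.pyplot as plt

# evaluate the fitted pipeline on a dense grid and colour each region by its prediction
x0, x1 = np.meshgrid(
    np.arange(X[:, 0].min() - 0.5, X[:, 0].max() + 0.5, 0.02),
    np.arange(X[:, 1].min() - 0.5, X[:, 1].max() + 0.5, 0.02),
)
Z = rbf_kernel_svm_clf.predict(np.c_[x0.ravel(), x1.ravel()]).reshape(x0.shape)

plt.contourf(x0, x1, Z, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=y)  # the raw moons data on top
plt.show()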
I have some data at work that is confidential, so I can't share it here, but the dataset below illustrates the point quite well. Basically, I want to run a feature-importance exercise to find the top independent features (in this case, RM, LSTAT, and DIS) that have the most influence on the dependent feature (MEDV). This is done! My question is: how can I use this model to find the IDs associated with the top independent features (RM, LSTAT, and DIS)?
After viewing the plot, is it simply a matter of sorting the dataframe, in descending order, by RM, LSTAT, and DIS, because these are the most influential features impacting the dependent feature? I don't think it works like that, but maybe that's all it is. In this case, I am assuming RM, LSTAT, and DIS are the 'worst' features, given the context of my business needs.
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
# Loading the dataset (note: load_boston was removed in scikit-learn 1.2)
x = load_boston()
df = pd.DataFrame(x.data, columns = x.feature_names)
df["MEDV"] = x.target
X = df.drop("MEDV", axis=1) #Feature Matrix
y = df["MEDV"] #Target Variable
df.head()
df['id'] = df.groupby(['MEDV']).ngroup()
df = df.sort_values(by=['MEDV'], ascending=True)
df.head(10)
names = X.columns  # use only the features the model is trained on; df now also contains MEDV and id
reg = RandomForestRegressor()
reg.fit(X, y)
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), reg.feature_importances_), names), reverse=True))
features = names
importances = reg.feature_importances_
indices = np.argsort(importances)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='#8f63f4', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()
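For the follow-up question, one reading is that you want the rows (and their id values) ranked by the most important features. A minimal sketch, assuming the df, X, and reg defined above, and assuming that sorting by those columns really matches the business need:
# rank features by importance and keep the top three
top = pd.Series(reg.feature_importances_, index=X.columns).nlargest(3).index.tolist()

# rows (with their ids) ordered by the most important feature first
print(df.sort_values(by=top, ascending=False)[['id'] + top].head(10))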
This is my code so far; my goal is for the confusion matrix to show the number in each box. Hope this is not too obvious, and thanks in advance.
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, roc_curve, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn import metrics
from mlxtend.plotting import plot_confusion_matrix
from string import digits
import pandas as pd
import matplotlib.pyplot as plt
y_actual = pd.Series([0,0,21,0,23,0,0,0,0,0,0,0,0,0,25], name='Actual')
y_pred = pd.Series([0,0,21,0,23,0,0,0,0,0,0,0,0,0,25], name='Predicted')
confm = pd.crosstab(y_actual, y_pred)
# row-normalised version, in case proportions are wanted instead of counts
df_conf_norm = confm.div(confm.sum(axis=1), axis="index")

def plot_confusion_matrix(conf_mat, values_format='d', title='Matriz de Confusion', cmap=plt.cm.hot_r):
    plt.matshow(conf_mat, cmap=cmap)
    plt.colorbar()
    # write the count into every box
    for i in range(conf_mat.shape[0]):
        for j in range(conf_mat.shape[1]):
            plt.text(j, i, format(conf_mat.iloc[i, j], values_format), ha='center', va='center')
    tick_marks = np.arange(len(conf_mat.columns))
    plt.xticks(tick_marks, conf_mat.columns, rotation=45)
    plt.yticks(tick_marks, conf_mat.index)
    plt.title(title)
    plt.ylabel(conf_mat.index.name)
    plt.xlabel(conf_mat.columns.name)
    plt.show()

plot_confusion_matrix(confm)
I tried to look for an answer here but the ones I've stumbled upon didn't work.
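Since mlxtend's plot_confusion_matrix is already imported above (your own function of the same name shadows it), a minimal sketch of letting the library annotate the boxes instead:
from mlxtend.plotting import plot_confusion_matrix as mlx_plot_confusion_matrix

# mlxtend expects a NumPy array and writes the count (and optionally the proportion) in every cell
fig, ax = mlx_plot_confusion_matrix(conf_mat=confm.to_numpy(),
                                    show_absolute=True,
                                    show_normed=True)
plt.show()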
I am new to scikit-learn and I have been working on a regression problem (the King County house prices CSV) on Kaggle. I have been training a regression model to predict the price of a house, and I want to plot the results but have no idea how to do so. I am using Python 3.6. Any advice or suggestions would be greatly appreciated.
#importing numpy and pandas, seaborn
import numpy as np #linear algebra
import pandas as pd #datapreprocessing, CSV file I/O
import seaborn as sns #for plotting graphs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
data = pd.read_csv('kc_house_data.csv')
data = data.drop('date',axis=1)
data = data.drop('id',axis=1)
X = data
Y = X['price'].values
X = X.drop('price', axis = 1).values
X_train, X_test, Y_train, Y_test = train_test_split (X, Y, test_size = 0.30, random_state=21)
reg = LinearRegression()
kfold = KFold(n_splits=15, shuffle=True, random_state=21)  # shuffle must be True for random_state to take effect
cv_results = cross_val_score(reg, X_train, Y_train, cv=kfold, scoring='r2')
print(cv_results)
round(np.mean(cv_results)*100, 2)
Here is the sklearn example of the kind of plot I am after: https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
You can use matplotlib for plotting:
import matplotlib.pyplot as plt

plt.figure(figsize=(16, 9))
plt.plot(cv_results)  # one R^2 score per fold
plt.show()
There are several kinds of plots you can use, such as a simple line plot or a scatter plot:
plt.barh(x, y)    # horizontal bar graph
plt.plot(x, y)    # line graph
plt.scatter(x, y) # scatter graph
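For a regression model specifically, a common diagnostic is predicted vs. actual values on the held-out set. A minimal sketch, assuming the reg, X_train/Y_train and X_test/Y_test from the question:
reg.fit(X_train, Y_train)
predictions = reg.predict(X_test)

# perfect predictions would fall on the diagonal
plt.scatter(Y_test, predictions, alpha=0.3)
plt.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'r--')
plt.xlabel('Actual price')
plt.ylabel('Predicted price')
plt.show()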
Seaborn is a very useful visualization library, so much so that you can use seaborn.regplot to directly plot the data and the regression-model fit line. It takes the predictor and response variables and draws the data points together with the best-fit line. Here is the documentation on how to use it:
https://seaborn.pydata.org/generated/seaborn.regplot.html
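For example, a minimal sketch against the King County data, taking sqft_living as the predictor (swap in whichever feature you care about):
import seaborn as sns

# data points plus the fitted regression line for one predictor
sns.regplot(x='sqft_living', y='price', data=data)
plt.show()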
I have also done the same competition on Kaggle.
For regressions I would go for a scatter plot:
import matplotlib.pyplot as plt
plt.scatter(x, y)
As for the visualisations on that particular competition I would use the following code:
# visualising some more outliers in the data values
# (train is the competition's training dataframe; numeric is the list of its numeric column names)
fig = plt.figure(figsize=(12, 120))  # plt.subplots(nrows=0, ...) errors on recent matplotlib
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
sns.color_palette("husl", 8)
for i, feature in enumerate(list(train[numeric]), 1):
    if feature == 'MiscVal':
        break
    plt.subplot(len(list(numeric)), 3, i)
    sns.scatterplot(x=feature, y='SalePrice', hue='SalePrice', palette='Blues', data=train)
    plt.xlabel('{}'.format(feature), size=15, labelpad=12.5)
    plt.ylabel('SalePrice', size=15, labelpad=12.5)
    plt.tick_params(axis='x', labelsize=12)
    plt.tick_params(axis='y', labelsize=12)
    plt.legend(loc='best', prop={'size': 10})
plt.show()
I have actually uploaded the full code for that competition on my GitHub if you want to have a look ;) (I am currently in the top 14% on that competition).
I am looking for an efficient way to apply multiple sklearn clustering algorithms to multiple datasets without too much repetition.
import pandas as pd
import numpy as np
from sklearn.datasets import make_moons,make_blobs
from sklearn.cluster import KMeans, DBSCAN
from matplotlib import pyplot
X1, y1 = make_moons(n_samples=100, noise=0.1)
X2, y2 = make_blobs(n_samples=100, centers=3, n_features=2)
And I want to apply both KMeans and DBSCAN to these datasets, but each dataset requires different parameters. How can I use a loop to apply multiple models to multiple datasets and eventually plot them out in a grid? Thanks.
You can create a few dicts defining the hyperparameters for each dataset/clustering-algorithm combination.
Maybe the following approach will work for you (adapted from sklearn's clustering documentation):
import numpy as np
from collections import namedtuple
from sklearn.datasets import make_moons, make_blobs
from sklearn.cluster import KMeans, DBSCAN
from matplotlib import pyplot as plt

noisy_moons = make_moons(n_samples=100, noise=0.1)
blobs = make_blobs(n_samples=100, centers=3, center_box=(-1, 1), cluster_std=0.1)

colors = np.array(['#377eb8', '#ff7f00', '#4daf4a',
                   '#f781bf', '#a65628', '#984ea3',
                   '#999999', '#e41a1c', '#dede00'])

# the clustering algorithms we want to try
clustering_models = [KMeans, DBSCAN]

# name each model after its class, so the params dicts below can refer to it
Model = namedtuple('Model', ['name', 'model'])
models = [Model(model.__name__, model) for model in clustering_models]

# per-dataset hyperparameters for each clustering algorithm
datasets_w_hyperparams = [
    (noisy_moons[0], {'KMeans': {'n_clusters': 2}, 'DBSCAN': {'eps': 0.3}}),
    (blobs[0], {'KMeans': {'n_clusters': 3}, 'DBSCAN': {'eps': 0.1}}),  # 3 blob centers were generated
]

f, axes = plt.subplots(len(datasets_w_hyperparams), len(models), figsize=(15, 10))
for data_id, (dataset, params) in enumerate(datasets_w_hyperparams):
    for model_id, (name, clus_model) in enumerate(models):
        ax = axes[data_id][model_id]
        pred = clus_model(**params[name]).fit_predict(dataset)
        ax.scatter(dataset[:, 0], dataset[:, 1], s=20, c=colors[pred])
        ax.set_title(name)
plt.show()
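To add another algorithm or dataset, append the class to clustering_models and give each dataset's dict a matching entry keyed by the class name; the subplot grid then grows automatically.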
I am having trouble with the last subplot: the crosstab plot appears in a figure by itself, while the main figure shows the first two subplots and an empty third one. How can I graph it so that all three plots come up in one figure and share the same y-axis ('Frequency')?
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score  # sklearn.cross_validation was removed in 0.20
from sklearn import metrics
#Data Exploration
data = sm.datasets.fair.load_pandas().data
data['affair'] = np.where(data['affairs'] > 0 , 1,0)
print(data)
print(data.groupby('affair').mean())
print(data.groupby('rate_marriage').mean())
plt.subplot(331)
data['educ'].hist()
plt.title('Histogram of Education')
plt.xlabel('Education Level')
plt.ylabel('Frequency')
plt.subplot(332)
data['rate_marriage'].hist()
plt.title('Histogram of Marriage Rating')
plt.xlabel('Marriage Rating')
plt.ylabel('Frequency')
plt.subplot(333)
pd.crosstab(data['rate_marriage'], data['affair'].astype(bool)).plot(kind='bar')
plt.title('Marriage Rating distribution by affair Status')
plt.xlabel('Marriage Rating')
plt.ylabel('Frequency')
plt.show()
You need to tell the pandas plotting function where to plot the data.
This can be achieved through the ax keyword.
ax = plt.subplot(333)
pd.crosstab(data['rate_marriage'], data['affair'].astype(bool)).plot(kind='bar', ax=ax)
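If you also want the three plots in a single row sharing the 'Frequency' axis, a minimal sketch reworking your plotting section with plt.subplots and sharey=True (assuming the data frame from your code):
fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharey=True)

data['educ'].hist(ax=axes[0])
axes[0].set(title='Histogram of Education', xlabel='Education Level', ylabel='Frequency')

data['rate_marriage'].hist(ax=axes[1])
axes[1].set(title='Histogram of Marriage Rating', xlabel='Marriage Rating')

pd.crosstab(data['rate_marriage'], data['affair'].astype(bool)).plot(kind='bar', ax=axes[2])
axes[2].set(title='Marriage Rating by Affair Status', xlabel='Marriage Rating')

plt.show()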