Standardized data of SVM - Scikit-learn/ Python - python

This is for an assignment where the SVM methods has to be used for model accuracy.
There were 3 parts, wrote the below code
import sklearn.datasets as datasets
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split
digits = datasets.load_digits();
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30, stratify=y)
print(X_train.shape)
print(X_test.shape)
from sklearn.svm import SVC
svm_clf = SVC().fit(X_train, y_train)
print(svm_clf.score(X_test,y_test))
But after this, the question is as below
Perform Standardization of digits.data and store the transformed data
in variable digits_standardized.
Hint : Use required utility from sklearn.preprocessing. Once again,
split digits_standardized into two sets names X_train and X_test.
Also, split digits.target into two sets Y_train and Y_test.
Hint: Use train_test_split method from sklearn.model_selection; set
random_state to 30; and perform stratified sampling. Build another SVM
classifier from X_train set and Y_train labels, with default
parameters. Name the model as svm_clf2.
Evaluate the model accuracy on testing data set and print it's score.
On top of the above code, tried writing this, but seems to be failing. Can anyone help on how the data can be standardized.
std_scale = preprocessing.StandardScaler().fit(X_train)
X_train_std = std_scale.transform(X_train)
X_test_std = std_scale.transform(X_test)
svm_clf2 = SVC().fit(X_train, y_train)
print(svm_clf.score(X_test,y_test))

Tried the below. Seems to be working.
import sklearn.datasets as datasets
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
digits = datasets.load_digits();
X = digits.data
scaler = StandardScaler()
scaler.fit(X)
digits_standardized = scaler.transform(X)
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(digits_standardized, y, random_state=30, stratify=y)
#print(X_train.shape)
#print(X_test.shape)
from sklearn.svm import SVC
svm_clf2 = SVC().fit(X_train, y_train)
print("Accuracy ",svm_clf2.score(X_test,y_test))

Try this as final code includes all Tasks
import sklearn.datasets as datasets
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
digits = datasets.load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30, stratify=y)
print(X_train.shape)
print(X_test.shape)
svm_clf = SVC().fit(X_train, y_train)
print(svm_clf.score(X_test,y_test))
scaler = StandardScaler()
scaler.fit(X)
digits_standardized = scaler.transform(X)
X_train, X_test, y_train, y_test = train_test_split(digits_standardized, y, random_state=30, stratify=y)
svm_clf2 = SVC().fit(X_train, y_train)
print(svm_clf2.score(X_test,y_test))

Related

How to merge predicted values to original pandas test data frame where X_test has been converted using CountVectorizer before splitting

I want to merge my predicted results of my test data to my X_test. I was able to merge it with y_test but since my X_test is a corpus I'm not sure how I can identify the indexes to merge.
My codes are as below
def lr_model(df):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
# Create corpus as a list
corpus = df['text'].tolist()
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, -1].values
# Splitting to testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Train Logistic Regression on Training set
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Merge true vs predicted labels
true_vs_pred = pd.DataFrame(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
return true_vs_pred
This gives me the y_test and y_pred but I'm not sure how I can add the X_test as an original data frame (the ids of the X_test) to this.
Any guidance is much appreciated. Thanks
Using a pipeline can help you link the original X_test with the prediction:
def lr_model(df):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.pipeline import Pipeline
# Defining X and y
cv = CountVectorizer()
X = df['text']
y = df.iloc[:, -1].values
# Splitting to testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Create a pipeline
pipeline = Pipeline([
('CountVectorizer', cv),
('LogisticRegression', LogisticRegression(random_state = 0)),
])
# Train pipeline on Training set
pipeline.fit(X_train, y_train)
# Predicting the Test set results
y_pred = pipeline.predict(X_test)
return X_test, y_test, y_pred

Sklearn - Pipeline with StandardScaler, PolynomialFeatures and Regression

I have the following model which scales the data, then uses polynomial features and finally feeds the data into a regression model with regularization, like so:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
polynomial = PolynomialFeatures(degree=3, include_bias=False)
polynomial.fit(X_train_scaled)
X_train_model = polynomial.transform(X_train_scaled)
X_test_model = polynomial.transform(X_test_scaled)
reg_model = Ridge(alpha=alpha)
reg_model.fit(X_train_model, y_train)
y_pred_train_model = reg_model.predict(X_train_model)
r2_train = r2_score(y_train, y_pred_train_model)
y_pred_test_model = reg_model.predict(X_test_model)
r2_test = r2_score(y_test, y_pred_test_model)
It works fine, but seems a bit cumbersome with many fits and transformations. I've heard about this Pipeline() method in sklearn. How can I use it above in order to simplify the process?
You can rewrite your code with Pipeline() as follows:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
# generate the data
X, y = make_regression(n_samples=1000, n_features=100, noise=10, bias=1, random_state=42)
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# define the pipeline
pipe = Pipeline(steps=[
('scaler', StandardScaler()),
('preprocessor', PolynomialFeatures(degree=3, include_bias=False)),
('estimator', Ridge(alpha=1))
])
# fit the pipeline
pipe.fit(X_train, y_train)
# generate the model predictions
y_pred_train_pipe = pipe.predict(X_train)
print(y_pred_train_pipe[:5])
# [11.37182811 89.22027129 -106.51012773 79.5912864 -241.0138516]
y_pred_test_pipe = pipe.predict(X_test)
print(y_pred_test_pipe[:5])
# [16.88238278 57.50116009 50.35705205 -20.92005052 -76.04156972]
# calculate the r-squared
print(pipe.score(X_train, y_train))
# 0.9999999999787197
print(pipe.score(X_test, y_test))
# 0.463044896596684
Equivalent code without Pipeline():
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
# generate the data
X, y = make_regression(n_samples=1000, n_features=100, noise=10, bias=1, random_state=42)
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# scale the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# extract the polynomial features
polynomial = PolynomialFeatures(degree=3, include_bias=False)
polynomial.fit(X_train_scaled)
X_train_model = polynomial.transform(X_train_scaled)
X_test_model = polynomial.transform(X_test_scaled)
# fit the model
reg_model = Ridge(alpha=1)
reg_model.fit(X_train_model, y_train)
# generate the model predictions
y_pred_train_model = reg_model.predict(X_train_model)
print(y_pred_train_model[:5])
# [11.37182811 89.22027129 -106.51012773 79.5912864 -241.0138516]
y_pred_test_model = reg_model.predict(X_test_model)
print(y_pred_test_model[:5])
# [16.88238278 57.50116009 50.35705205 -20.92005052 -76.04156972]
# calculate the r-squared
print(r2_score(y_train, y_pred_train_model))
# 0.9999999999787197
print(r2_score(y_test, y_pred_test_model))
# 0.463044896596684

Machine Learning Using Scikit-Learn & SVM

Load popular digits dataset from sklearn.datasets module and assign it to variable digits.
Split digits.data into two sets names X_train and X_test. Also, split digits.target into two sets Y_train and Y_test.
Hint: Use train_test_split() method from sklearn.model_selection; set random_state to 30; and perform stratified sampling.
Build an SVM classifier from X_train set and Y_train labels, with default parameters. Name the model as svm_clf.
Evaluate the model accuracy on the testing data set and print its score.
I used the following code:
import sklearn.datasets as datasets
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split
digits = datasets.load_digits();
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30)
print(X_train.shape)
print(X_test.shape)
from sklearn.svm import SVC
svm_clf = SVC().fit(X_train, y_train)
print(svm_clf.score(X_test,y_test))
I got the below output.
(1347,64)
(450,64)
0.4088888888888889
But I am not able to pass the test. Can someone help with what is wrong?
You are missing the stratified sampling requirement; modify your split to include it:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30, stratify=y)
Check the documentation.

StandardScaling ruins the test data score for my linear regression

When I apply StandardScaler to my train data after traintestsplit, the score for the train data is ok but the score for my test data makes no sense.
I tried LinearRegression(normalize=True) and it also made the score of my test data go crazy.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)
lr = LinearRegression()
lr.fit(X_train_sc, y_train)
print(lr.score(X_train_sc, y_train))
print(lr.score(X_test_sc, y_test))
Results are:
0.961269156232134
-1.5466488732709964e+19
Why??? Please, help!
Note: If I do not run the StandardScaler, then both my scores make perfect sense.

How to perform SMOTE with cross validation in sklearn in python

I have a highly imbalanced dataset and would like to perform SMOTE to balance the dataset and perfrom cross validation to measure the accuracy. However, most of the existing tutorials make use of only single training and testing iteration to perfrom SMOTE.
Therefore, I would like to know the correct procedure to perfrom SMOTE using cross-validation.
My current code is as follows. However, as mentioned above it only uses single iteration.
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())
clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(x_train_res, y_train_res)
I am happy to provide more details if needed.
You need to perform SMOTE within each fold. Accordingly, you need to avoid train_test_split in favour of KFold:
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
kf = KFold(n_splits=5)
for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
X_train = X[train_index]
y_train = y[train_index] # Based on your code, you might need a ravel call here, but I would look into how you're generating your y
X_test = X[test_index]
y_test = y[test_index] # See comment on ravel and y_train
sm = SMOTE()
X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train, y_train)
model = ... # Choose a model here
model.fit(X_train_oversampled, y_train_oversampled )
y_pred = model.predict(X_test)
print(f'For fold {fold}:')
print(f'Accuracy: {model.score(X_test, y_test)}')
print(f'f-score: {f1_score(y_test, y_pred)}')
You can also, for example, append the scores to a list defined outside.
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
cv = StratifiedKFold(n_splits=5)
for train_idx, test_idx, in cv.split(X, y):
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]
X_train, y_train = SMOTE().fit_sample(X_train, y_train)
....
I think you can also solve this with a pipeline from the imbalanced-learn library.
I saw this solution in a blog called Machine Learning Mastery https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
The idea is to use a pipeline from imblearn to do the cross-validation. Please, let me know if that works. The example below is with a decision tree, but the logic is the same.
#decision tree evaluated on imbalanced dataset with SMOTE oversampling
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
# define dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
# define pipeline
steps = [('over', SMOTE()), ('model', DecisionTreeClassifier())]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
score = mean(scores))

Categories

Resources