I'm new to machine learning in Python and have seen the concept of stacking models, and I wanted to give it a shot. The problem is I don't know how to predict new data, as I don't fully understand how machine learning is implemented in Python. The code that I managed to scrape together looks like this:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from vecstack import stacking
import pandas as pd
X = pd.read_csv('db/file_name3.csv')
y = pd.read_csv('db/train_labels(1).csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
models = [
    CatBoostRegressor(iterations=200,
                      learning_rate=0.03,
                      depth=4,
                      loss_function='RMSE',
                      eval_metric='RMSE',
                      random_seed=99,
                      od_type='Iter',
                      od_wait=50,
                      logging_level='Silent'),
    CatBoostRegressor(iterations=500,
                      learning_rate=0.06,
                      depth=3,
                      loss_function='RMSE',
                      eval_metric='RMSE',
                      random_seed=99,
                      od_type='Iter',
                      od_wait=50,
                      logging_level='Silent'),
    ExtraTreesRegressor(random_state=0, n_jobs=-1,
                        n_estimators=100, max_depth=3),
    RandomForestRegressor(random_state=0, n_jobs=-1,
                          n_estimators=300, max_depth=3),
    XGBRegressor(eta=0.02, reg_lambda=5, reg_alpha=1),
    XGBRegressor(eta=0.1, reg_lambda=1, reg_alpha=10),
    XGBRegressor(eta=0.02, reg_lambda=1, reg_alpha=10, n_estimators=300),
    XGBRegressor(eta=0.012, max_depth=3, n_estimators=200),
    GradientBoostingRegressor(),
    BaggingRegressor(),
]
test1 = pd.read_csv('db/Cleaned Data.csv')
S_train, S_test = stacking(models, X_train, y_train, X_train,
                           regression=True, metric=mean_absolute_error, n_folds=10,
                           shuffle=True, random_state=0, verbose=2)
model = XGBRegressor()  # second-level (meta) model
model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
print(y_pred.shape)
As you can see, test1 is the data that I want to predict, but I couldn't figure out how to do it. I can predict the data from my training set, but not the new one. I have not changed any of the model parameters from the documentation.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
# load your X, y and test1 data here
RF = RandomForestRegressor(random_state=0, n_jobs=-1,
                           n_estimators=300, max_depth=3)
# Validation function
n_folds = 5
def rmsle_cv(model):
    # Keep the KFold object itself: calling .get_n_splits() here would
    # discard the shuffle/random_state settings and just return an int
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X.values, y.values.ravel(),
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse
score = rmsle_cv(RF)
print("Random Forest score: {:.3f} ({:.3f})\n".format(score.mean(),
score.std()))
RF.fit(X,y.values.ravel())
RF_train_pred = RF.predict(X)
RF_pred = RF.predict(test1)
print (RF_pred.shape)
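If you want to keep the stacked ensemble instead of a single random forest, a minimal sketch would look like the following (assuming test1 has the same columns as X; the XGBRegressor meta-model is just an illustrative choice). vecstack's stacking takes the test-set features as its fourth argument, so test1 goes there instead of X_train:
from vecstack import stacking
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
# S_train holds out-of-fold predictions for X_train;
# S_test1 holds the fold-averaged predictions for test1
S_train, S_test1 = stacking(models, X_train, y_train, test1,
                            regression=True, metric=mean_absolute_error,
                            n_folds=10, shuffle=True, random_state=0, verbose=0)
meta = XGBRegressor()  # illustrative second-level model; any regressor works
meta.fit(S_train, y_train)
test1_pred = meta.predict(S_test1)
print(test1_pred.shape)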
I was trying to run a classification on the following dataset.
Dataset
I achieved amazingly high results (too high), and I think I must have done something wrong.
I'm trying to classify the age group based on the rest of the characteristics. I know there is a large correlation between the variables and I will have to fix all of that, but for now I wanted to use everything as predictors.
Here is the code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
dataset = pd.read_csv('abalone.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1:].values
dataset["Sex"]= dataset["Sex"].replace('M', 0)
dataset["Sex"]= dataset["Sex"].replace('F', 1)
dataset["Sex"]= dataset["Sex"].replace('I', 2)
dataset["Age group"]= dataset["Age group"].replace('young abalone', 0)
dataset["Age group"]= dataset["Age group"].replace('middle-aged abalone', 1)
dataset["Age group"]= dataset["Age group"].replace('mature abalone', 2)
dataset["Age group"]= dataset["Age group"].replace('senior abalone', 3)
dataset['Age group'] = dataset['Age group'].astype(int).astype(float)
dataset['Rings'] = dataset['Rings'].astype(int).astype(float)
dataset['Sex'] = dataset['Sex'].astype(int).astype(float)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1:].values
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
X[:, 0] = labelencoder.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()
# Avoiding the Dummy Variable Trap
X = X[:, 1:]
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm
## KNN
# Fitting K-NN to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier2 = KNeighborsClassifier(n_neighbors = 7, metric = 'minkowski', p = 2)
classifier2.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier2.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm
I put in all the code because I don't know exactly what went wrong.
I haven't checked other accuracy measures yet, because the confusion matrix itself suggests that something probably went wrong.
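For completeness, this is the kind of check I plan to run next, a short sketch using the same y_test and y_pred as above:
from sklearn.metrics import accuracy_score, classification_report
# Overall accuracy plus per-class precision/recall/F1
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))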
I tried Random Forests regression.
The code is given below.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
np.random.seed(0)
d1 = np.random.randint(2, size=(50, 10))
d2 = np.random.randint(3, size=(50, 10))
d3 = np.random.randint(4, size=(50, 10))
Y = np.random.randint(7, size=(50,))
X = np.column_stack([d1, d2, d3])
n_smples, n_feats = X.shape
print (n_smples, n_feats)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
regr = RandomForestRegressor(max_features=None,random_state=0)
pipe = make_pipeline(RFECV(estimator=regr, step=3, cv=kf,
                           scoring='neg_mean_squared_error', n_jobs=-1),
                     GridSearchCV(regr, param_grid={'n_estimators': [100, 300]},
                                  cv=kf, scoring='neg_mean_squared_error',
                                  n_jobs=-1))
ypredicts = cross_val_predict(pipe, X, Y, cv=kf, n_jobs=-1)
rmse = mean_squared_error(Y, ypredicts)
print (rmse)
However, I got the following error:
sklearn.exceptions.NotFittedError: Estimator not fitted, call fit before exploiting the model.
I also tried:
model = pipe.fit(X,Y)
ypredicts = cross_val_predict(model, X, Y, cv=kf, n_jobs=-1)
But got the same error.
Edit 1:
I also tried:
pipe.fit(X,Y)
But got the same error.
In Python 2.7 (Sklearn 0.20), the same code gave me a different error:
TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.
In Python 2.7 (Sklearn 0.20.3):
NotFittedError: Estimator not fitted, call fit before exploiting the model.
It seems that you are trying to select the best parameters for your classifier using grid search; there is another way to do so. You are using pipelines, but in this method I am not using a pipeline; instead I get the best parameters through random search.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
np.random.seed(0)
d1 = np.random.randint(2, size=(50, 10))
d2 = np.random.randint(3, size=(50, 10))
d3 = np.random.randint(4, size=(50, 10))
Y = np.random.randint(7, size=(50,))
X = np.column_stack([d1, d2, d3])
n_smples, n_feats = X.shape
print (n_smples, n_feats)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
regr = RandomForestRegressor(max_features=None,random_state=0)
n_iter_search = 20
random_search = RandomizedSearchCV(regr, param_distributions={'n_estimators': [100, 300]},
                                   n_iter=n_iter_search, cv=kf, verbose=1,
                                   return_train_score=True)
random_search.fit(X, Y)
ypredicts=random_search.predict(X)
rmse = mean_squared_error(Y, ypredicts)
print(rmse)
print(random_search.best_params_)
random_search.cv_results_
Try this piece of code. I hope it solves your problem.
Instead of
model = pipe.fit(X,Y)
have you tried
pipe.fit(X,Y)
instead?
so that would be
pipe.fit(X,Y)
# change model to pipe
ypredicts = cross_val_predict(pipe, X, Y, cv=kf, n_jobs=-1)
I'm unable to match LGBM's cv score by hand.
Here's an MCVE:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import numpy as np
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
folds = KFold(5, random_state=42)
params = {'random_state': 42}
results = lgb.cv(params, lgb.Dataset(X_train, y_train), folds=folds, num_boost_round=1000, early_stopping_rounds=100, metrics=['auc'])
print('LGBM\'s cv score: ', results['auc-mean'][-1])
clf = lgb.LGBMClassifier(**params, n_estimators=len(results['auc-mean']))
val_scores = []
for train_idx, val_idx in folds.split(X_train):
    clf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    val_scores.append(roc_auc_score(y_train.iloc[val_idx],
                                    clf.predict_proba(X_train.iloc[val_idx])[:, 1]))
print('Manual score: ', np.mean(np.array(val_scores)))
I was expecting the two CV scores to be identical - I have set random seeds, and done exactly the same thing. Yet they differ.
Here's the output I get:
LGBM's cv score: 0.9851513530737058
Manual score: 0.9903622177441328
Why? Am I not using LGBM's cv module correctly?
You are splitting X into X_train and X_test.
For cv you split X_train into 5 folds, while manually you split X into 5 folds, i.e. you use more points manually than with cv.
Change results = lgb.cv(params, lgb.Dataset(X_train, y_train), ...) to results = lgb.cv(params, lgb.Dataset(X, y), ...).
Furthermore, the parameters can differ. For example, the number of threads used by LightGBM changes the result. During cv the models are fitted in parallel, so the number of threads used might differ from your manual sequential training.
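If the thread count is the suspect, one way to rule it out is to pin it on both sides; here is a minimal sketch mirroring your calls (the value 1 is just illustrative):
import lightgbm as lgb
# Pin LightGBM to a single thread in both the cv run and the manual loop
params = {'random_state': 42, 'num_threads': 1}
results = lgb.cv(params, lgb.Dataset(X_train, y_train), folds=folds,
                 num_boost_round=1000, early_stopping_rounds=100, metrics=['auc'])
clf = lgb.LGBMClassifier(**params, n_estimators=len(results['auc-mean']))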
EDIT after 1st correction:
You can achieve the same results using manual splitting / cv using this code:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import numpy as np
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
folds = KFold(5, random_state=42)
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
}
data_all = lgb.Dataset(X_train, y_train)
results = lgb.cv(params, data_all,
                 folds=folds.split(X_train),
                 num_boost_round=1000,
                 early_stopping_rounds=100)
print('LGBM\'s cv score: ', results['auc-mean'][-1])
val_scores = []
for train_idx, val_idx in folds.split(X_train):
    data_trd = lgb.Dataset(X_train.iloc[train_idx],
                           y_train.iloc[train_idx],
                           reference=data_all)
    gbm = lgb.train(params,
                    data_trd,
                    num_boost_round=len(results['auc-mean']),
                    verbose_eval=100)
    val_scores.append(roc_auc_score(y_train.iloc[val_idx],
                                    gbm.predict(X_train.iloc[val_idx])))
print('Manual score: ', np.mean(np.array(val_scores)))
yields
LGBM's cv score: 0.9914524426410262
Manual score: 0.9914524426410262
What makes the difference is the line reference=data_all. During cv, the binning of the variables (see the LightGBM docs) is constructed using the whole dataset (X_train), while in your manual for loop it was built on the training subset only (X_train.iloc[train_idx]). By passing a reference to the Dataset containing all the data, LightGBM will reuse the same binning, giving the same results.
I am using TransformedTargetRegressor to transform my target into log space. It is done like this:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import TransformedTargetRegressor
clf = TransformedTargetRegressor(regressor=GradientBoostingRegressor(**params),
                                 func=np.log1p, inverse_func=np.expm1)
However when I later call
feature_importance = clf.feature_importances_
I get
AttributeError: 'TransformedTargetRegressor' object has no attribute
'feature_importances_'
I would have thought that all the attributes of the original class would be inherited. How can this be solved?
For further context, here is an official example. Replacing the initialization line with mine reproduces the crash.
As the TransformedTargetRegressor documentation says, one can access its component regressor through .regressor_.
So this is what you want:
clf.regressor_.feature_importances_
Workable code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import TransformedTargetRegressor  # requires sklearn >= 0.20
# #############################################################################
# Load data
boston = datasets.load_boston()
X, y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]
# #############################################################################
# Fit regression model
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
#clf = ensemble.GradientBoostingRegressor(**params)
clf = TransformedTargetRegressor(regressor=GradientBoostingRegressor(**params),
                                 func=np.log1p, inverse_func=np.expm1)
clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)
print(clf.regressor_.feature_importances_)
Its output:
MSE: 7.7145
[6.45223704e-02 1.32970011e-04 2.92221184e-03 4.48101769e-04
3.57392613e-02 2.02435922e-01 1.22755948e-02 7.03996426e-02
1.54903176e-03 1.90771421e-02 1.98577625e-02 1.63376111e-02
5.54302378e-01]
This is a question regarding best practices for sklearn.
I was experimenting with SVMs using the iris dataset provided in the sklearn library. While using train_test_split, I wondered which parameter to adjust to avoid overfitting. I was taught to adjust test_size (roughly to ~0.3), but there is also a train_size parameter. Would it not make sense to adjust train_size to avoid overfitting, or am I misunderstanding something here?
I get similar results regardless of which parameter I adjust, but I don't know if that's always the case.
Appreciate any help. Thanks!
Here is the code I am currently working with:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
scaler = StandardScaler()
scaler.fit(df)
scaled_df = scaler.transform(df)
df = pd.DataFrame(data=scaled_df, columns=iris.feature_names)
x = df
y = iris.target
#test_size is used here, but is swapped with train_size to experiment
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.33)
c_param = np.arange(1, 100, 10)
gamma_param = np.arange(0.0001, 1, 0.001)
params = {'C':c_param, 'gamma':gamma_param}
grid = GridSearchCV(estimator=SVC(), param_grid=params, verbose=0)
grid_fit = grid.fit(x_train, y_train)
grid_pred = grid.predict(x_test)
print(grid.best_params_)
print('\n')
print("Number of training records: ", len(x_train))
print("Number of test records: ", len(x_test))
print('\n')
print(classification_report(y_true=y_test, y_pred=grid_pred))
print('\n')
print(confusion_matrix(y_true=y_test, y_pred=grid_pred))
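For reference, here is the quick check I used to convince myself the two parameters are complementary; note it fixes random_state (my code above does not), since otherwise every call reshuffles:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split as tts
iris = load_iris()
# With the same random_state, test_size=0.33 and train_size=0.67
# yield identical splits on iris' 150 rows (50 test / 100 train)
a_train, a_test, _, _ = tts(iris.data, iris.target, test_size=0.33, random_state=0)
b_train, b_test, _, _ = tts(iris.data, iris.target, train_size=0.67, random_state=0)
print(np.array_equal(a_train, b_train), np.array_equal(a_test, b_test))  # True True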