I've tried to do the wine_quality exercise on Kaggle.
Here is my code (the beginning):
X = data.drop(["quality"], axis=1)
Y = data["quality"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

def encodage(df):
    code = {"positive": 1,
            "negative": 0,
            "detected": 1,
            "not_detected": 0}
    for col in df.select_dtypes("object").columns:
        df.loc[:, col] = df[col].map(code)
    return df

encodage(X_train)
encodage(X_test)
model_test = DecisionTreeClassifier(random_state=0)

def evaluation(model):
    model.fit(X_train, Y_train)
    ypred = model.predict(X_test)
    print(confusion_matrix(Y_test, ypred))
    print(classification_report(Y_test, ypred))
numerical_features = make_column_selector(dtype_include=np.number)
categorical_features = make_column_selector(dtype_include=np.number)

numerical_pipeline = make_pipeline(SimpleImputer(), StandardScaler(), PolynomialFeatures(2, include_bias=False), SelectKBest(f_classif, k=10))
categorical_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder()), SelectKBest(f_classif, k=10)

preprocessor = make_column_transformer((numerical_pipeline, numerical_features), (categorical_pipeline, categorical_features))

RandomForest = make_pipeline(preprocessor, RandomForestClassifier(random_state=0))
AdaBoost = make_pipeline(preprocessor, AdaBoostClassifier(random_state=0))
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))
KNN = make_pipeline(preprocessor, StandardScaler(), KNeighborsClassifier())

dict_of_models = {"RandomForest": RandomForest, "AdaBoost": AdaBoost, "SVM": SVM, "KNN": KNN}

for name, model in dict_of_models.items():
    print(name)
    evaluation(model)
Everything was fine, and I had a score of 0.66 with model_test (evaluation not shown here), but when I reach the for name, model in ... loop, I get this error:
TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. '(Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
('onehotencoder', OneHotEncoder())]), SelectKBest())' (type <class 'tuple'>) doesn't.
The closing parenthesis on your categorical_pipeline line is misplaced: make_pipeline(...) closes right after OneHotEncoder(), so the trailing ,SelectKBest(f_classif, k=10) turns the assignment into the tuple (Pipeline(...), SelectKBest(...)) that the error message complains about. make_pipeline takes its steps as positional arguments, not a list, so the fix is simply to move SelectKBest inside the call. Separately, note that categorical_features uses dtype_include=np.number, the same selector as numerical_features; you probably want dtype_exclude=np.number there.
https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html
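A corrected version of that line (a minimal sketch, keeping your other steps unchanged):

# SelectKBest now sits inside make_pipeline instead of dangling after it
categorical_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"),
                                     OneHotEncoder(),
                                     SelectKBest(f_classif, k=10))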
Check out these other questions:
ColumnTransformer generating a TypeError when trying to fit_transform pipeline in sklearn
sklearn:TypeError: All estimators should implement fit and transform
I am trying to create a pipeline that first imputes missing data, then does oversampling with SMOTE, and then fits the model.
My code worked perfectly before I tried SMOTE; now I can't find any solution.
Here is the code without SMOTE:
scoring = ['balanced_accuracy', 'f1_macro']
imputer = SimpleImputer(strategy='most_frequent')
pipeline = Pipeline(steps=[('i', imputer),('m', model)])
# define model evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_validate(pipeline, X, y, scoring=scoring, cv=cv, n_jobs=-1)
And here's the code after adding SMOTE.
Note: I tried importing make_pipeline from imblearn.
imputer = SimpleImputer(strategy='most_frequent')
pipeline = Pipeline(steps=[('i', imputer),('over', SMOTE()),('m', model)])
# define model evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_validate(pipeline, X, y, scoring=scoring, cv=cv, n_jobs=-1)
When I import Pipeline from sklearn I get this error:
All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SMOTE()' (type <class 'imblearn.over_sampling._smote.base.SMOTE'>) doesn't
When I tried importing make_pipeline from imblearn I got this error:
Last step of Pipeline should implement fit or be the string 'passthrough'. '[('i', SimpleImputer(strategy='most_frequent')), ('over', SMOTE()), ('m', RandomForestClassifier())]' (type <class 'list'>) doesn't
Use the imblearn Pipeline. sklearn's Pipeline requires every intermediate step to implement fit and transform, which SMOTE does not (it implements fit_resample instead), while imblearn's Pipeline knows how to handle resamplers:
from imblearn.pipeline import Pipeline
pipeline = Pipeline([('i', imputer),('over', SMOTE()),('m', model)])
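If you prefer make_pipeline, imblearn provides that too; note that it takes the steps as separate positional arguments rather than a list (passing the whole list as one step is what triggered your second error). A minimal sketch:

from imblearn.pipeline import make_pipeline

# Step names ('simpleimputer', 'smote', ...) are generated automatically
pipeline = make_pipeline(imputer, SMOTE(), model)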
I have a dataset that contains 17 features (x) and binary classification results (y). I already prepared the dataset and performed train_test_split() on it. I'm using the following script to run different ML algorithms on the dataset to compare between them:
def run_exps(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame) -> pd.DataFrame:
    # Lightweight script to test many models and find winners
    # :param X_train: training split
    # :param y_train: training target vector
    # :param X_test: test split
    # :param y_test: test target vector
    # :return: DataFrame of predictions
    models = [
        ('LogReg', LogisticRegression()),
        ('RF', RandomForestClassifier()),
        ('KNN - Euclidean', KNeighborsClassifier(metric='euclidean')),
        ('SVM', SVC()),
        ('XGB', XGBClassifier(use_label_encoder=False, eval_metric='error'))
    ]
    names = []
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
    # For loop that takes each model and performs training, cross validation, prediction and evaluation
    for name, model in models:
        # Making a pipeline that normalizes and oversamples the dataset
        pipe = Pipeline([
            ('normalization', MinMaxScaler()),
            ('oversampling', SMOTE())
        ])
        kfold = StratifiedKFold(n_splits=5)
        # How can I call the pipeline inside the cross_validate() function?
        cv_results = cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring, verbose=3)
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print('''
        {}
        {}
        {}
        '''.format(name, classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred)))
        names.append(name)
I have noticed that the data I'm using needs to be normalized and oversampled before I run the script.
However, since I'm using the cross_validate() function inside the script, normalization and oversampling need to happen within each fold.
To do so, I created a pipeline (which normalizes and oversamples the dataset) inside the for loop, but I'm not sure how to call the pipeline, since the estimator parameter of cross_validate() already takes the model variable to perform the prediction.
What should I do in this case?
You could integrate your model within your pipeline and then call cross_validate on the pipeline as follows:
pipe = Pipeline([
('normalization', MinMaxScaler()),
('oversampling', SMOTE()),
('name', model)
])
cv_results = cross_validate(pipe, X_train, y_train, cv=kfold, scoring=scoring, verbose=3)
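One caveat, tying back to the previous answer: because the pipeline contains SMOTE, the Pipeline class must come from imblearn rather than sklearn, or it will be rejected for not implementing transform. A sketch of the adjusted loop under that assumption:

from imblearn.pipeline import Pipeline  # not sklearn.pipeline

for name, model in models:
    # SMOTE is applied only when fitting each training fold,
    # never to the held-out validation fold
    pipe = Pipeline([
        ('normalization', MinMaxScaler()),
        ('oversampling', SMOTE()),
        ('model', model)
    ])
    cv_results = cross_validate(pipe, X_train, y_train,
                                cv=StratifiedKFold(n_splits=5),
                                scoring=scoring, verbose=3)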
I trained a LinearSVC classifier with a NER dataset (https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus) and would like for it to be able to predict new data. From what I have read I need to create and save the model as a pipeline to do this. I have been trying to do this based on other examples on SO but can't get it to work. How can I turn my existing model into a pipelined version?
The first code snippet saves the model, and the second is one of my attempts at turning it into a pipeline, but I get an 'str' object has no attribute 'items' error. I think it has to do with the to_dict step, but I don't know how to replicate this in a pipelined version. Can anyone help?
dframe = pd.read_csv("ner.csv", encoding = "ISO-8859-1", error_bad_lines=False)
dframe.dropna(inplace=True)
dframe[dframe.isnull().any(axis=1)].size
x_df = dframe.drop(['Unnamed: 0', 'sentence_idx', 'tag'], axis=1)
vectorizer = DictVectorizer()
X = vectorizer.fit_transform(x_df.to_dict("records"))
y = dframe.tag.values
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
model = LinearSVC(loss="squared_hinge",C=0.5,class_weight='balanced',multi_class='ovr')
model.fit(x_train, y_train)
dump(model, 'filename.joblib')
dframe = pd.read_csv("ner.csv", encoding = "ISO-8859-1", error_bad_lines=False)
dframe.dropna(inplace=True)
dframe[dframe.isnull().any(axis=1)].size
x_df = dframe.drop(['Unnamed: 0', 'sentence_idx', 'tag'], axis=1)
y = dframe.tag.values
x_train, x_test, y_train, y_test = train_test_split(x_df, y, test_size=0.1, random_state=0)
pipe = Pipeline([('vectorizer', DictVectorizer(x_df.to_dict("records"))), ('model', LinearSVC)])
pipe.fit(x_train, y_train)
You have to adjust your second part like this:
dframe = pd.read_csv("ner.csv", encoding = "ISO-8859-1", error_bad_lines=False)
dframe.dropna(inplace=True)
dframe[dframe.isnull().any(axis=1)].size
x_df = dframe.drop(['Unnamed: 0', 'sentence_idx', 'tag'], axis=1)
y = dframe.tag.values
x_train, x_test, y_train, y_test = train_test_split(x_df.to_dict("records"), y, test_size=0.1, random_state=0)
pipe = Pipeline([('vectorizer', DictVectorizer()), ('model', LinearSVC(loss="squared_hinge",C=0.5,class_weight='balanced',multi_class='ovr'))])
pipe.fit(x_train, y_train)
You were trying to pass your data to DictVectorizer() as a constructor parameter with
DictVectorizer(x_df.to_dict("records"))
but that does not work; the only available parameters for DictVectorizer are listed in the documentation.
The second mistake was fitting your DictVectorizer() in the pipeline on the wrong kind of data with
pipe.fit(x_train, y_train)
The problem is that x_train, which is just a split of x_df, is handed to the DictVectorizer(), but earlier, without the pipeline, you fed the DictVectorizer() the data in the form x_df.to_dict("records").
So you need to give the pipeline the same type of data. That's why the adjusted code splits x_df.to_dict("records") with train_test_split(), so that the vectorizer can process it.
The last thing is that you also forgot the parentheses (and your hyperparameters) when instantiating LinearSVC in your pipeline:
('model', LinearSVC)
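Once the pipeline is fitted, it can be saved and reloaded as a single object, which was the original goal; a minimal sketch with joblib (the 'ner_pipeline.joblib' filename is just an example):

from joblib import dump, load

# The pipeline predicts directly on raw records; the DictVectorizer
# inside it handles the dict-to-matrix conversion
y_pred = pipe.predict(x_test)

dump(pipe, 'ner_pipeline.joblib')   # saves vectorizer + model together
pipe = load('ner_pipeline.joblib')  # ready to predict on new data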
I want to use RFE for feature selection in a pipeline. I have no problems getting it to work in pipelines without GridSearch. However, when I try to incorporate GridSearch, I keep getting a value error (NB. the models are fine without RFE).
I have tried to use feature_selection as was suggested in this topic: Grid Search with Recursive Feature Elimination in scikit-learn pipeline returns an error, but this results in the same error.
What could be wrong?
my error:
ValueError: Invalid parameter alpha for estimator RFE(estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
normalize=True, random_state=None, solver='auto',
tol=0.001),
n_features_to_select=4, step=1, verbose=1). Check the list of available parameters with estimator.get_params().keys().
this works fine:
rfe = RFE(estimator=LinearRegression(), n_features_to_select=4, verbose=1)
# setup the pipeline steps
steps = [('scaler', StandardScaler()),
         ('imputation', SimpleImputer(missing_values=np.NaN, strategy='most_frequent')),
         ('reg', rfe)]
# Create the pipeline: pipeline
pipeline = Pipeline(steps)
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the pipeline to the training set:
pipeline.fit(X_train, y_train)
# Predict the labels of the test set
y_pred = pipeline.predict(X_test)
print()
# Print the features and their ranking (high = dropped early on)
print(dict(zip(X.columns, rfe.ranking_)))
# Print the features that are not eliminated
print(X.columns[rfe.support_])
print()
print("R^2: {}".format(pipeline.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))
this doesn't work:
rfe = RFE(estimator=Ridge(normalize=True), n_features_to_select=4, verbose=1)
# setup the pipeline steps
steps = [('scaler', StandardScaler()),
         ('imputation', SimpleImputer(missing_values=np.NaN, strategy='most_frequent')),
         ('ridge', rfe)]
# Create the pipeline: pipeline
pipeline = Pipeline(steps)
#Define hyperparameters and range of Grid Search
parameters = {"ridge__alpha": np.linspace(0,1,100)}
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# run cross validation
cv = GridSearchCV(pipeline, param_grid = parameters, cv=3)
# Fit the pipeline to the training set:
cv.fit(X_train, y_train)
# Predict the labels of the test set
y_pred = cv.predict(X_test)
# Compute and print R^2 and RMSE
print("R^2: {}".format(cv.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))
print("Tuned Model Parameters: {}".format(cv.best_params_))
using feature_selection also doesn't work:
selector = feature_selection.RFE(Ridge(normalize=True))
# setup the pipeline steps
steps = [('scaler', StandardScaler()),
         ('imputation', SimpleImputer(missing_values=np.NaN, strategy='most_frequent')),
         ('RFE', selector)]
# Create the pipeline: pipeline
pipeline = Pipeline(steps)
The question is old, but in case someone stumbles upon it:
You can access the hyperparameter alpha, or any other parameter of the estimator wrapped inside RFE(estimator=...), through the nested parameter name '<step name>__estimator__<parameter>':
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.feature_selection import RFE

model = RFE(estimator=Ridge())

pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("rfe", model)
    ]
)

param = {
    "rfe__step": np.linspace(0.1, 1, 10),
    "rfe__estimator__alpha": np.logspace(-3, 3, 7)
}

tscv = TimeSeriesSplit(n_splits=5).split(X_train)
gridsearch = GridSearchCV(estimator=pipe, cv=tscv, param_grid=param, refit=True, return_train_score=True, n_jobs=-1)
fit = gridsearch.fit(X_train, y_train)
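After the search, the tuned value appears under the nested name, for example:

print(gridsearch.best_params_)          # e.g. {'rfe__estimator__alpha': ..., 'rfe__step': ...}
best_pipe = gridsearch.best_estimator_  # refit=True, so this is already fit and ready to predict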
I am using the following method to train a linear regressor to predict retweets of tweets, with 'text' as the feature and 'retweet_count' as the target. However, I have several additional numerical features in my data, such as hasMedia, hasHashtag, followers_count and sentiment. How can I combine these features with the 'text' feature that was converted to a tfidf vector?
I already tried concatenating pandas DataFrames, but then when I give it new test data the features mismatch. Please see my question at Attributes mismatch between training and testing data in sklearn - linear regression.
def predict_retweets(dataset):
    tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, stop_words='english', lowercase=False)
    keyword_response = tfidf.fit_transform(dataset['text']).toarray()
    X = keyword_response
    y = dataset['retweet_count']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    print(df)
    return None
Sample of data
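For what it's worth, a minimal sketch of one common way to combine the tfidf text features with the numeric columns inside a single pipeline, using sklearn's ColumnTransformer (column names are taken from the question; the custom tokenizer settings are omitted for brevity):

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

# Vectorize the 'text' column and pass the numeric columns through
# unchanged, so train and test data get identical feature layouts
preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(stop_words='english'), 'text'),
    ('numeric', 'passthrough',
     ['hasMedia', 'hasHashtag', 'followers_count', 'sentiment']),
])

pipe = Pipeline([('prep', preprocessor), ('regressor', LinearRegression())])
# pipe.fit(X_train, y_train); pipe.predict(X_test)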