The number of rows is close to 1 million. I define the models I want to test as follows:
X = df_final[['short_description', 'details', 'root_cause']]

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1500, ngram_range=(1, 3),
                              stop_words='english', strip_accents='ascii'))])

countvec_pipeline = Pipeline([
    ('countvec', CountVectorizer(max_features=1500, ngram_range=(1, 1),
                                 stop_words='english', strip_accents='ascii', binary=True))])

preprocessor_pipeline = ColumnTransformer(
    transformers=[
        ('short_description', countvec_pipeline, 'short_description'),
        ('details', tfidf_pipeline, 'details'),
        ('root_cause', countvec_pipeline, 'root_cause'),
    ])

models = [
    ('rf', RandomForestClassifier(n_estimators=100,
                                  max_depth=3,
                                  random_state=0,
                                  n_jobs=-1)),
    ('svc', LinearSVC()),
    ('nb', MultinomialNB()),
    ('lr', LogisticRegression(random_state=0,
                              solver='saga',
                              n_jobs=-1))
]
I then do a fit_transform like so:
X_prepped = preprocessor_pipeline.fit_transform(X)
The part below is where things break: as soon as the loop hits the first model (the random forest), my kernel dies.
for model_name, model in models:
    print(model_name)
    results_dict = cross_validate(model, X_prepped, labels, cv=cv,
                                  scoring='accuracy', return_train_score=True)
    results_df = pd.DataFrame(results_dict)
Is there something I am doing wrong here?
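(Not part of the original question: a minimal diagnostic sketch, assuming the crash is memory-related rather than a logic error. Rerunning the exact same loop on a small random subsample is one way to check; the 50,000-row sample size is arbitrary. If the small run completes, the full run is most likely exhausting RAM.)
import numpy as np

# Hypothetical sanity check: same loop, small random subsample of the prepared matrix
rng = np.random.RandomState(0)
sample_idx = rng.choice(X_prepped.shape[0], size=50_000, replace=False)
X_small = X_prepped[sample_idx]
labels_small = np.asarray(labels)[sample_idx]

for model_name, model in models:
    print(model_name)
    results_dict = cross_validate(model, X_small, labels_small, cv=cv,
                                  scoring='accuracy', return_train_score=True)
    print(pd.DataFrame(results_dict))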
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import nltk
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import chi2, SelectKBest
import spacy
from sklearn.preprocessing import MaxAbsScaler
df=pd.read_csv('input_data.csv')
NUMERIC=['Cost','Field2']
TEXT=['Text1', 'Text2']
def combine_text_columns(data_frame, text_labels=TEXT):
    """Join the text columns of each row of data_frame into a single string."""
    # to_drop = set(to_drop) & set(data_frame.columns.tolist())
    # text_data = data_frame.drop(to_drop, axis=1)
    text_data = data_frame[text_labels]
    # Replace NaNs with blanks (assign rather than calling fillna(inplace=True) on a slice)
    text_data = text_data.fillna("")
    # Join all text items in a row with a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)
nlp = spacy.load('en_core_web_sm')
stopwords=spacy.lang.en.stop_words.STOP_WORDS
chi_k = 300
# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'
get_text_data = FunctionTransformer(combine_text_columns,validate=False)
# Preprocess the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC], validate=False)
pl = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('imputer', SimpleImputer())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', TfidfVectorizer(stop_words='english')),
                # ('vectorizer', CountVectorizer(stop_words=stopwords, token_pattern=TOKENS_ALPHANUMERIC, ngram_range=(1, 2))),
                # ('dim_red', SelectKBest(chi2, chi_k)),
            ]))
        ]
    )),
    ('scale', MaxAbsScaler()),
    # ('svc', LinearSVC())
    # ('nb', MultinomialNB()),
    # ('clf', OneVsRestClassifier(RandomForestClassifier(n_estimators=15)))
    ('lr', OneVsRestClassifier(LogisticRegression(C=100)))
])
TARGET=['IsApartment']
sc_X = StandardScaler()
encoder=LabelEncoder()
label_enc=pd.Series(encoder.fit_transform(df['OpportunityName']))
NUMERIC2 = NUMERIC + ['opportunityName_enc']
X2=df[NUMERIC]
X2=pd.concat([X2,label_enc],axis=1)
X2 = pd.DataFrame(sc_X.fit_transform(X2),columns=NUMERIC2)
# Build the model input from the scaled numeric features plus the raw text columns
X = pd.concat([X2, df[TEXT]], axis=1)
y=df[TARGET]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.3, random_state=42)
pl.fit(X_train,y_train)
predictions=pl.predict(X_train)
accuracy = pl.score(X_train, y_train)
print("\nAccuracy on sample data - numeric, no nans:{:.2f}% ".format(accuracy))
ytrain_pred_probas = pl.predict_proba(X_train)[:, 1]
# prob of predict as 1
fpr, tpr, thresholds = roc_curve(y_train, ytrain_pred_probas) # precision_recall_curve
roc = pd.DataFrame({'FPR':fpr,'TPR':tpr,'Thresholds':thresholds})
_ = plt.figure()
plt.plot(roc.FPR, roc.TPR)
plt.axvline(0.1, color = '#00C851', linestyle = '--')
plt.xlabel("FPR")
plt.ylabel("TPR")
I have the following toy code.
I use a pipeline to automatically normalize numerical variables and apply one-hot-encoding to the categorical ones.
I can get the coefficients of the logistic regression model easily using pipe['logisticregression'].coef_, but how can I get all the feature names, in the right order, as they appear in the coefficient matrix?
from sklearn.compose import ColumnTransformer
import numpy as np, pandas as pd
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
# data from https://www.kaggle.com/datasets/uciml/adult-census-income
data = pd.read_csv("adult.csv")
data = data.iloc[0:3000,:]
target = "workclass"
y = data[target]
X = data.drop(columns=target)
numerical_columns_selector = make_column_selector(dtype_exclude=object)
categorical_columns_selector = make_column_selector(dtype_include=object)
numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)
ct = ColumnTransformer([('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
                        ('std', StandardScaler(), numerical_columns)])
model = LogisticRegression(max_iter=500)
pipe = make_pipeline(ct, model)
data_train, data_test, target_train, target_test = train_test_split(
X, y, random_state=42)
pipe.fit(data_train, target_train)
pipe['logisticregression'].coef_.shape
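Not part of the original post, but a minimal sketch of one way to line the coefficients up with their feature names, assuming scikit-learn >= 1.0 (where ColumnTransformer exposes get_feature_names_out) and the default step names produced by make_pipeline:
# 'columntransformer' is the name make_pipeline gives the ColumnTransformer step
feature_names = pipe['columntransformer'].get_feature_names_out()
coefs = pipe['logisticregression'].coef_

# one row of coefficients per class, one column per transformed feature
coef_df = pd.DataFrame(coefs, columns=feature_names)
print(coef_df.iloc[:, :5])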
My ML project is about "Loan Eligibility prediction".
For that I used the data below: https://www.kaggle.com/code/sazid28/home-loan-prediction/data?select=train.csv
My code is as follows:
import random
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import \
SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import \
OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_selection import RFECV
df_train_original = pd.read_csv("train.csv.xls")
df = df_train_original.drop(df_train_original.columns[0], axis=1)
# Replace missing 'Credit_History' entries with a random observed value (0 or 1)
np.random.seed(0)  # np.random.choice is used below, so seed numpy rather than the random module
df['Credit_History'] = df['Credit_History'].apply(
    lambda x: np.random.choice(df['Credit_History'].dropna().values)
    if np.isnan(x) else x)

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Data pre-processing
numerical_feature, categorical_feature = [], []
for i in X.columns:
    if X[i].dtype == 'O':
        categorical_feature.append(i)
    else:
        numerical_feature.append(i)
imputer = IterativeImputer(random_state=0)
scaler = StandardScaler()
encoder = OrdinalEncoder()
# Replace categorical features with the most frequent value of the column
# Gender (-13) , Married (-3), Self_Employed (-32)
# LoanAmount (-22) Loan_Amount_Term (-14) Credit_History (-50)
numerical_pipeline = make_pipeline(imputer, scaler)
categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), encoder)
preprocessor = make_column_transformer((numerical_pipeline, numerical_feature),
(categorical_pipeline, categorical_feature),
remainder='passthrough')
clf = LogisticRegression(random_state=0, max_iter=df.shape[0])
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.2,
random_state=0)
params = {
'logisticregression__class_weight': [None, 'balanced'],
'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
'logisticregression__C': np.linspace(0.001, 0.1, 30),
}
model = make_pipeline(preprocessor, clf)
selector = RFECV(model, step=1, min_features_to_select=2, cv=5)
selector.fit(X_train, y_train)
When I run the code I get:
ValueError: could not convert string to float: 'Male'
I think the data is not being fitted and transformed before it goes through RFECV.
How can I fix this?
RFECV does not work with a pipeline as the estimator, because it requires the estimator to expose either a coef_ or a feature_importances_ attribute. Pipelines do not, and even if they did, there would be no guarantee that the feature importances of the final estimator correspond to the features fed into the pipeline, given the arbitrary transformations applied in between.
What you can do instead is make the RFECV transformer a step of your pipeline, between the preprocessing and the final estimator, i.e.:
preprocessor = make_column_transformer((numerical_pipeline, numerical_feature),
                                       (categorical_pipeline, categorical_feature),
                                       remainder='passthrough')
clf_fs = LogisticRegression(random_state=0, max_iter=df.shape[0])
clf = LogisticRegression(random_state=0, max_iter=df.shape[0])
feature_selector = RFECV(clf_fs, step=1, min_features_to_select=2, cv=5)
model = make_pipeline(preprocessor, feature_selector, clf)
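A small follow-up sketch (my addition, not part of the answer above) showing how the composed pipeline might be fitted and inspected, assuming the default step names assigned by make_pipeline:
# make_pipeline names steps after their lowercased class names, so the selector step is 'rfecv'
model.fit(X_train, y_train)
mask = model.named_steps['rfecv'].support_   # boolean mask over the *transformed* features
print("features kept:", mask.sum(), "of", mask.size)
print("test accuracy:", model.score(X_test, y_test))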
I am a novice getting started with data, and I am trying to predict the price of a house with a regression model (a pricing exercise on Kaggle, file: train.csv). I start by exploring the data.
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score , cross_val_predict, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, LassoCV
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
%matplotlib inline
data = pd.read_csv('train.csv', index_col = 0)
pd.set_option('display.max_row',80)
pd.set_option('display.max_column',80)
hp = data.copy()
hp = hp.apply(lambda x: x.fillna(0) if x.dtype.kind in 'biufc' else x.fillna(hp.mean()))
hp.isna().sum()
from sklearn.preprocessing import StandardScaler
numeric_features = hp.select_dtypes(exclude = ['object'])
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_features = hp.select_dtypes('object')
categorical_features = categorical_features.fillna('Z')
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
y = numeric_features['SalePrice']
X = numeric_features.drop('SalePrice', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
scaler = preprocessing.StandardScaler().fit(X)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X_train_scaled, y_train,
                                    scoring="neg_mean_squared_error", cv=5))
    return rmse
How to implement my for loop and call my function to get a result in my Ridge regression model?
coefs = []
rmse = []
alphas = [0.01, 0.05, 0.1, 0.3, 0.8, 1, 5, 10, 15, 30, 50]
for a in alphas:
    ridge = Ridge(alpha=a, fit_intercept=True)
    ridge.fit(X_train_scaled, y_train)
    coefs.append(ridge.coef_)
    rmse_cv()
The scikit-learn documentation includes a method that returns the predictions (predict). Another online source, smith.edu, explains how to use it with an example.
Here is how you can integrate them into your solution:
coefs = []
rmse = []
alphas = [0.01, 0.05, 0.1, 0.3, 0.8, 1, 5, 10, 15, 30, 50]
for a in alphas:
    ridge = Ridge(alpha=a, fit_intercept=True)
    ridge.fit(X_train_scaled, y_train)
    pred = ridge.predict(X_test_scaled)            # predict on the scaled test set
    coefs.append(ridge.coef_)
    rmse.append(mean_squared_error(y_test, pred))  # calculate the test MSE
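A small follow-up (my addition, not from the original answer) to read off which alpha gave the lowest test error:
# index of the smallest test error across the alphas tried
best_alpha = alphas[int(np.argmin(rmse))]
print("best alpha:", best_alpha)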
I want to train on a relatively large record set (200,000 rows and 400 columns) in a pipeline, and only a weak notebook is available for the task.
The dataset has 15 independent classes and mixed categorical and numerical features. An SVM-like algorithm should be chosen.
I already tried to put some code together.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
from sklearn.multiclass import OneVsRestClassifier
X, y = make_classification(n_samples=200000, n_features=130, n_informative=105,
                           n_redundant=25, n_classes=15, n_clusters_per_class=15)

# add some categorical columns
X[:, :2] = np.abs(X[:, :2]).astype(int)
X = pd.DataFrame(X, columns=[f'F{i}' for i in range(X.shape[1])])
cols = X.columns.tolist()
y = LabelBinarizer().fit_transform(y)

#%% Transformation
full_pipeline = ColumnTransformer([
    ('numerical', StandardScaler(), cols[2:]),
    ('categorical', OneHotEncoder(categories='auto'), cols[:2])
])

# Sparse matrix
X = full_pipeline.fit_transform(X)

# set start
rbf = RBFSampler(gamma=0.1, random_state=42)
semi_svm = SGDClassifier(loss="hinge", penalty="l2", max_iter=50)
clf_pipe = Pipeline([
    ('rbf', rbf),
    ('svm', semi_svm)
])
cv = StratifiedShuffleSplit(n_splits=5)
grid_search = RFECV(estimator=OneVsRestClassifier(clf_pipe), step=3, cv=cv,
                    scoring='accuracy', n_jobs=-1, verbose=10)
grid_search.fit(X, y)
ValueError: bad input shape (200000, 15)
How to handle the multiclass error in this case?
The following solution worked for me:
# Extra imports on top of the ones in the question
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsOneClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV

...
y = LabelEncoder().fit_transform(y)
...
rbf = RBFSampler(gamma=0.1, random_state=42)
semi_svm = OneVsOneClassifier(SGDClassifier(loss="hinge", penalty="l2", max_iter=5000))
selection = SelectKBest(k=1)
clf_pipe = Pipeline([
    ('rbf', rbf),
    ('features', selection),
    ('svm', semi_svm)
])
cv = StratifiedShuffleSplit(n_splits=5)
param_grid = dict(features__k=np.logspace(1, 6, num=5, base=2).round().astype(int),
                  rbf__gamma=[0.1, 1])
grid_search = GridSearchCV(estimator=clf_pipe, cv=cv, param_grid=param_grid,
                           scoring='f1', n_jobs=-1, verbose=10)
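For completeness (my addition, not part of the answer above): the search still has to be fitted before any results are available. Note that with a 15-class target the plain 'f1' scorer assumes a binary problem; 'f1_macro' is the usual multiclass choice, but that is an assumption about the setup rather than something the original author stated.
# Fit the search and inspect the winning configuration (assumes X and y as prepared above)
grid_search.fit(X, y)
print(grid_search.best_params_)
print(grid_search.best_score_)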
I want to add model evaluation using k-fold cross-validation (k = 10) and a confusion matrix, but I'm confused about how to do it.
dataset : https://github.com/fadholifh/dats/blob/master/cpas.txt
Using Python 3.7
import sklearn.metrics
import sen
import csv
import os
import re
import nltk
import scipy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
factorys = StemmerFactory()
stemmer = factorys.create_stemmer()
if __name__ == "__main__":
    main()
The desired result is a confusion matrix, and for the k-fold evaluation each fold should report its F1-score, precision, and recall.
# Imports needed for this snippet
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

df = pd.read_csv("cpas.txt", header=None, delimiter="\t")
X = df[1].values
y = df[0].values
stop_words = stopwords.words('english')
stemmer = PorterStemmer()

def clean_text(text, stop_words, stemmer):
    return " ".join([stemmer.stem(word) for word in word_tokenize(text)
                     if word not in stop_words and not word.isnumeric()])

X = np.array([clean_text(text, stop_words, stemmer) for text in X])
kfold = KFold(3, shuffle=True, random_state=33)
i = 1
for train_idx, test_idx in kfold.split(X):
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    y_test = y[test_idx]
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    model = LinearSVC()
    model.fit(X_train, y_train)
    print("Fold : {0}".format(i))
    i += 1
    print(classification_report(y_test, model.predict(X_test)))
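The question also asked for a per-fold confusion matrix; one way to add it (my addition, using the confusion_matrix already imported in the question's import block) is to print it next to the classification report inside the same loop:
    # ... inside the per-fold loop, after fitting the model ...
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))   # per-fold confusion matrix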
The main reason to use cross-validation is parameter tuning when the amount of data is limited. You can use grid search with CV to do this:
df = pd.read_csv("cpas.txt", header=None, delimiter="\t")
X = df[1].values
labels = df[0].values
text = np.array([clean_text(text, stop_words, stemmer) for text in X])
idx = np.arange(len(text))
np.random.shuffle(idx)
text = text[idx]
labels = labels[idx]
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('svm', LinearSVC())])
params = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'vectorizer__lowercase': [True, False],
    'vectorizer__norm': ['l1', 'l2']}
model = GridSearchCV(pipeline, params, cv=3, verbose=1)
model.fit(text, labels)
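To get back to what the question actually asked for (k = 10 folds and a confusion matrix), one option, my own sketch rather than part of the answer above, is to take the best estimator found by the grid search and run it through cross_val_predict:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report

# 10-fold out-of-fold predictions with the best pipeline found by the grid search
y_pred = cross_val_predict(model.best_estimator_, text, labels, cv=10)
print(confusion_matrix(labels, y_pred))
print(classification_report(labels, y_pred))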