Cross-validation for Random forest to select important feature

Cross-validation for Random forest to select important feature - python

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
import numpy as np  # needed for np.mean below; the original never imported it
import pandas as pd

# Load the iris data into a DataFrame and separate features from the target.
data = load_iris()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
df['target'] = data['target']
X = df.drop(columns=['target'])
y = df['target']

clf = RandomForestClassifier(n_estimators=50, max_depth=4)

num_features = len(X.columns)
print(num_features)

# Score every feature on its own with 10-fold cross-validation.
# NOTE: this ranks features by how well each one predicts the target alone;
# interactions between features are not taken into account.
scores = []
for col in X.columns:
    # reshape(-1, 1) turns the single column into the 2-D (n_samples, 1)
    # array that the estimator expects.
    single_feature = X[col].values.reshape(-1, 1)
    score = np.mean(cross_val_score(clf, single_feature, y, cv=10))
    # Store the mean accuracy as an integer percentage next to the column name.
    scores.append((int(score * 100), col))

# Highest-scoring feature first.
print(sorted(scores, reverse=True))
I intended to perform 10-fold cross-validation to select the most important features, but I am not sure my approach is right. Also, how can I plot the most important features? I would appreciate your suggestions!

Related

NLP classification with sparse and numerical features crashes

I have a dataset of 10 million english shows, which has been cleaned and lemmatized, and their classification into different category types such as comedy, documentary, action, ... etc
I also have a feature called duration, which is the length of the tv show.
Data can be found here
I perform tfidf vectorization on the titles, which returns a sparse matrix and normalization on the duration column.
Then I want to feed the data to a logistic regression classifier.
Side question: is there a better way to combine a sparse matrix with a numerical column?
When I combine them using todense() or toarray(), the conversion itself works.
However, when I pass the result to the logistic regression function, the notebook crashes. If I leave out the duration column, so that I don't have to call toarray() or todense(), it works perfectly. Is this a memory issue?
This is my code:
import os
import pandas as pd
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
def normalize(df, col = ''):
    """Min-max scale a single column of ``df``.

    Returns whatever ``MinMaxScaler.fit_transform`` produces for that column.
    """
    # Double brackets select a one-column DataFrame, i.e. 2-D input for the scaler.
    column_frame = df[[col]]
    return MinMaxScaler().fit_transform(column_frame)
def tfidf(X, col = ''):
    """Fit a TF-IDF vectorizer on ``X[col]`` and return the transformed matrix.

    The vectorizer is limited to 10000 features and uses ``max_df = 0.8``.
    """
    text_column = X[col]
    vectorizer = TfidfVectorizer(max_features = 10000, max_df = 0.8)
    return vectorizer.fit_transform(text_column)
def get_training_data(df):
    """Load the CSV at path ``df`` and build the feature matrix and target.

    Rows with missing values are dropped and the remaining rows are shuffled.
    The features are the TF-IDF matrix of ``name_title`` with the min-max
    scaled ``Duration`` appended as one extra column.

    Returns:
        (X, y): ``X`` is a scipy CSR sparse matrix, ``y`` is the ``target``
        column of the shuffled frame.
    """
    # Local import so this function brings its own dependency into scope.
    from scipy.sparse import csr_matrix, hstack

    frame = shuffle(pd.read_csv(df).dropna())
    data = frame[['name_title', 'Duration']]
    X_duration = normalize(data, col = 'Duration')
    X_sparse = tfidf(data, col = 'name_title')
    # Keep everything sparse. Calling X_sparse.toarray() on a matrix of shape
    # (971426, 10001) needs roughly 78 GB as float64, which is what crashed
    # the notebook. Stacking the duration as a sparse column avoids that.
    X = hstack([X_sparse, csr_matrix(X_duration)], format='csr')
    y = frame['target']
    return X, y
def logistic_regression(X, y):
    """Fit a logistic regression on a random split of (X, y) and print results.

    Prints the predictions for the held-out part and the accuracy on it.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    model = LogisticRegression(
        C = 100.0, random_state = 1, solver = 'lbfgs', multi_class = 'ovr'
    )
    # fit() returns the estimator itself, so predict can be chained.
    y_predict = model.fit(X_train, y_train).predict(X_test)
    print(y_predict)
    accuracy = metrics.accuracy_score(y_test, y_predict)
    print("Logistic Regression Accuracy %.3f" % accuracy)
# Directory, relative to the working directory, that holds the input CSV.
data_path = '../data/'
# Build the feature matrix and the labels from the processed CSV.
X, y = get_training_data(os.path.join(data_path, 'podcasts_en_processed.csv'))
print(X.shape) # this prints (971426, 10001)
# Train on a random split and print the predictions and the accuracy.
logistic_regression(X, y)

Explication cross_val_score scikit_learn parameter cv

I don't understand why I get a different result with this configuration of cross_val_score
than with a simple model.
from sklearn.datasets import load_iris
from sklearn.utils import shuffle
from sklearn import tree
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score,cross_val_predict

np.random.seed(1234)
iris = load_iris()
X, y = iris.data, iris.target
X, y = shuffle(X, y)
print(y)

# --- 1) Fit on ALL samples and evaluate on the same samples -----------------
clf = tree.DecisionTreeClassifier(max_depth=2, class_weight={2: 0.3, 1: 10, 0: 0.3}, random_state=1234)
clf2 = clf.fit(X, y)
tree.plot_tree(clf2)
predi = clf2.predict(X)
cm = confusion_matrix(y_true=y, y_pred=predi)
print(cm)
print("Accuracy = ", round(accuracy_score(y, predi) * 100.0, 2))

# --- 2) cross_val_score with one explicit (train, test) split ---------------
max_id = len(X)
# First 60% of the shuffled samples for training, the rest for testing.
limit = int(round(max_id * 0.6))
min_id = 0
train = list(range(min_id, limit))
test = list(range(limit, max_id))
print(train)
print(test)
predi = cross_val_score(clf, X, y, cv=[(train, test)])
print(predi)

# --- 3) The same split done by hand ------------------------------------------
# Index with the full index lists. The original sliced [first:last], which
# drops the last sample of each part, and it stored the training rows in
# `train` instead of `Xtrain`, so `Xtrain` was undefined and the two
# evaluations never used the same samples.
Xtrain = X[train]
y_train = y[train]
Xtest = X[test]
y_test = y[test]
clf3 = clf.fit(Xtrain, y_train)
predi = clf3.predict(Xtest)
cm = confusion_matrix(y_true=y_test, y_pred=predi)
print(cm)
print("Accuracy = ", round(accuracy_score(y_test, predi) * 100.0, 2))
I don't understand why I get a different accuracy even though I use the same parameters and the same train/test samples.

Basically, the kind of data split you use will have an impact on your model's accuracy. This is well documented in the machine learning field. Secondly, the score of your first model is heavily biased, because you used the training set for testing, which results in ~100% accuracy.
https://www.analyticsvidhya.com/blog/2021/05/4-ways-to-evaluate-your-machine-learning-model-cross-validation-techniques-with-python-code/
https://towardsdatascience.com/train-test-split-c3eed34f763b

100% accuracy in random forest and 94% accuracy in KNN?

I was going to classify into the following dataset.
Dataset
I achieved amazingly high results (too high), so I think I must have done something wrong.
I'm trying to classify the age group based on the rest of the characteristics. I know there is a large correlation between the variables and I will have to fix that, but for now I wanted to use all of them as independent variables.
Here is the code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

dataset = pd.read_csv('abalone.csv')

# Map the text categories to integer codes.
dataset["Sex"] = dataset["Sex"].replace({'M': 0, 'F': 1, 'I': 2})
dataset["Age group"] = dataset["Age group"].replace({
    'young abalone': 0,
    'middle-aged abalone': 1,
    'mature abalone': 2,
    'senior abalone': 3,
})
dataset['Age group'] = dataset['Age group'].astype(int).astype(float)
dataset['Rings'] = dataset['Rings'].astype(int).astype(float)
dataset['Sex'] = dataset['Sex'].astype(int).astype(float)

# Features are every column but the last; the last column is the target.
# NOTE(review): 'Rings' stays among the features. If 'Age group' was derived
# from 'Rings', the label leaks into X and near-perfect accuracy is expected.
# Confirm how 'Age group' was built and drop 'Rings' if so.
X = dataset.iloc[:, :-1].values
# 1-D target; a 2-D column vector makes scikit-learn emit a conversion warning.
y = dataset.iloc[:, -1].values

# Encoding categorical data
# One-hot encode column 0 by hand. OneHotEncoder(categorical_features=[0])
# is no longer accepted by current scikit-learn releases.
labelencoder = LabelEncoder()
sex_codes = labelencoder.fit_transform(X[:, 0])
sex_onehot = np.eye(len(labelencoder.classes_))[sex_codes]
# Avoiding the Dummy Variable Trap: drop the first indicator column.
# The indicator columns come first, followed by the remaining features.
X = np.hstack([sex_onehot[:, 1:], X[:, 1:].astype(float)])

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling (fit on the training part only, then apply to the test part)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

## KNN
# Fitting K-NN to the Training set
classifier2 = KNeighborsClassifier(n_neighbors = 7, metric = 'minkowski', p = 2)
classifier2.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier2.predict(X_test)
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
I put all the code because I don't know exactly what went wrong.
I haven't checked yet other accuracy measures because the confusion matrix itself says something probably went wrong

Make prediction from Pandas DataFrame

I am very new to DataScience/Pandas in general. I mainly followed this and could get it to work using different classifiers.
import pandas as pd
import src.helper as helper
import time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
# Column names for the header-less CSV ('class' is the column predicted later)
headings = [
    'class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population', 'habitat',
]

# Load the data
shrooms = pd.read_csv('data/shrooms_no_header.csv', names=headings, converters={"header": float})

# Replace the ? in 'stalk-root' with 0 (via NaN, then fill)
unknown_root = shrooms['stalk-root'] == '?'
shrooms.loc[unknown_root, 'stalk-root'] = np.nan
shrooms.fillna(0, inplace=True)

# Remove columns with only one unique value
for col in shrooms.columns.values:
    distinct = shrooms[col].unique()
    if len(distinct) > 1:
        continue
    print("Removing column {}, which only contains the value: {}".format(col, distinct[0]))
    shrooms.drop(col, axis=1, inplace=True)

# Col to predict later
col_predict = 'class'

# Binary Encoding: label-encode the target, one indicator column per feature value
all_cols = list(shrooms.columns.values)
all_cols.remove(col_predict)
helper.encode(shrooms, [col_predict])
# Expand Shrooms DataFrame to Binary Values
helper.expand(shrooms, all_cols)

# Feature columns = every column except the one we want to predict
x_all = list(shrooms.columns.values)
x_all.remove(col_predict)

# Set Train/Test ratio and split the DF
ratio = 0.7
df_train, df_test, X_train, Y_train, X_test, Y_test = helper.split_df(shrooms, col_predict, x_all, ratio)

# Try different classifier
# TODO: Batch Use to compare
classifier = GradientBoostingClassifier(n_estimators=1000)
# TODO: Optimize Hyperparamter (where applicable)

# Time the training (process time, measured around fit only)
timer_start = time.process_time()
classifier.fit(X_train, Y_train)
timer_stop = time.process_time()
time_diff = timer_stop - timer_start

# Get the score on both parts of the split
score_train = classifier.score(X_train, Y_train)
score_test = classifier.score(X_test, Y_test)
print('Train Score {}, Test Score {}, Time {}'.format(score_train, score_test, time_diff))
# TODO: Test a manual DataFrame
# TODO: Test a manual DataFrame
The "helpers" are functions I don't quite understand fully, but they work:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
def split_df(df, y_col, x_cols, ratio):
    """
    Randomly partition ``df`` into a train part and a test part.

    Each row independently goes to the train part when a uniform random draw
    is below ``ratio`` (e.g. 0.7), otherwise to the test part.

    Returns ``(train, test, x_train, y_train, x_test, y_test)``: the two
    DataFrames, followed by the ``x_cols`` / ``y_col`` values of each part.
    """
    # One random draw per row; True means the row belongs to the train part.
    in_train = np.random.rand(len(df)) < ratio
    train, test = df[in_train], df[~in_train]
    x_train, y_train = train[x_cols].values, train[y_col].values
    x_test, y_test = test[x_cols].values, test[y_col].values
    return train, test, x_train, y_train, x_test, y_test
def encode(df, columns):
    """Label-encode each of the given columns of ``df`` in place."""
    for col in columns:
        encoder = LabelEncoder()
        # Fit on the distinct values, then map the whole column to its codes.
        encoder.fit(list(df[col].unique()))
        df[col] = encoder.transform(list(df[col].values))
def expand(df, list_columns):
    """Replace each listed column of ``df`` by 0/1 indicator columns, in place.

    For every distinct value ``v`` of a column ``c`` a column ``"c_is_v"`` is
    added that holds 1 where ``c == v`` and 0 elsewhere. The original columns
    are dropped afterwards.
    """
    for col in list_columns:
        for colvalue in df[col].unique():
            indicator = "{}_is_{}".format(col, colvalue)
            is_match = df[col] == colvalue
            df.loc[is_match, indicator] = 1
            df.loc[~is_match, indicator] = 0
    df.drop(list_columns, inplace=True, axis=1)
def correlation_to(df, col):
    """Bar-plot the absolute correlation of the columns of ``df`` with ``col``.

    Correlations come from ``df.corr()`` and are shown in descending order of
    absolute value.
    """
    ranked = df.corr()[col].copy().abs().sort_values(ascending=False)
    # The top-ranked entry is skipped; presumably it is ``col`` correlated
    # with itself (1.0). Verify if other columns can tie with it.
    y_values = list(ranked.values)[1:]
    xlabels = list(ranked.keys())[1:]
    x_values = range(0, len(y_values))

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.bar(x_values, y_values)
    ax.set_title('The correlation of all features with {}'.format(col), fontsize=20)
    ax.set_ylabel('Pearson correlatie coefficient [abs waarde]', fontsize=16)
    plt.xticks(x_values, xlabels, rotation='vertical')
    plt.show()
I would like to have a "manual" test, such as entering x attributes and getting a prediction based on that.
So for example, I hardcode a DataFrame like the following:
# Hand-written single-row DataFrame used as a manual prediction input.
# It holds raw category letters for the feature columns only; there is no
# 'class' column because that is the value to be predicted.
manual = pd.DataFrame({
"cap-shape": ["x"],
"cap-surface": ["s"],
"cap-color": ["n"],
"bruises": ["f"],
"odor": ["n"],
"gill-attachment": ["a"],
"gill-spacing": ["c"],
"gill-size": ["b"],
"gill-color": ["y"],
"stalk-shape": ["e"],
"stalk-root": ["?"],
"stalk-surface-above-ring": ["s"],
"stalk-surface-below-ring": ["s"],
"stalk-color-above-ring": ["o"],
"stalk-color-below-ring": ["o"],
"veil-type": ["p"],
"veil-color": ["o"],
"ring-number": ["o"],
"ring-type": ["p"],
"spore-print-color": ["o"],
"population": ["c"],
"habitat": ["l"]
})
How would I apply the same encoding? My code calls helper.encode(manual, [col_predict]), but manual of course does not have a col_predict column.
Please bear in mind I am a complete beginner. I searched the web a lot, but I cannot find a proper source/tutorial that lets me test a single sample.
The full code can be found here.

Try this:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

data = pd.read_csv('agaricus-lepiota.data.txt', header=None)  # read data (no header row)
data.rename(columns={0: 'y'}, inplace=True)  # rename predict column (edible or not)
le = LabelEncoder()  # encoder used for label encoding
# Apply label encoding to all columns. The single encoder is re-fitted for
# every column, so afterwards `le` only remembers the mapping of the last one.
data = data.apply(lambda x: le.fit_transform(x))
# X without predict column. `columns=` is used because pandas 2.x no longer
# accepts the axis as a second positional argument of drop().
X = data.drop(columns='y')
y = data['y']  # predict column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = GradientBoostingClassifier()  # you can pass arguments
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)  # predictions for the objects in the test part
print(accuracy_score(y_test, y_pred))  # check accuracy
I think you can read more about this in sklearn site.
Is this example what you want?
To check your manual data:
# NOTE(review): fit_transform re-fits the encoder on each column of `manual`
# itself, so the resulting codes come from the manual row(s) only and need not
# match the codes the classifier was trained on. Encoders fitted on the
# training data (one per column) should be reused via transform() instead.
# TODO confirm and fix.
manual = manual.apply(lambda x: le.fit_transform(x))
clf.predict(manual)

Scikit-Learn: Adjust train_size or test_size?

This is a question regarding best practices for sklearn.
I have been experimenting with SVMs using the iris dataset provided in the sklearn library. While using train_test_split, I was wondering which parameter to adjust to avoid overfitting. I was taught to adjust test_size (to roughly 0.3), but there is also a train_size parameter. Would it not make sense to adjust train_size to avoid overfitting, or am I misunderstanding something here?
I get similar results regardless of which parameter I adjust, but I don't know if that's always the case.
Appreciate any help. Thanks!
Here is the code I am currently working with:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
iris = load_iris()
feature_names = iris.feature_names
df = pd.DataFrame(data=iris.data, columns=feature_names)

# Standardize the features and wrap the result in a DataFrame again.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)
df = pd.DataFrame(data=scaled_df, columns=feature_names)
x, y = df, iris.target

#test_size is used here, but is swapped with train_size to experiment
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.33)

# Grid of C and gamma values searched for the SVC.
c_param = np.arange(1, 100, 10)
gamma_param = np.arange(0.0001, 1, 0.001)
params = {'C': c_param, 'gamma': gamma_param}
grid = GridSearchCV(estimator=SVC(), param_grid=params, verbose=0)
grid_fit = grid.fit(x_train, y_train)
grid_pred = grid.predict(x_test)

# Report the chosen parameters, the split sizes and the test-set metrics.
print(grid.best_params_)
print('\n')
print("Number of training records: ", len(x_train))
print("Number of test records: ", len(x_test))
print('\n')
report = classification_report(y_true=y_test, y_pred=grid_pred)
print(report)
print('\n')
matrix = confusion_matrix(y_true=y_test, y_pred=grid_pred)
print(matrix)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Cross-validation for Random forest to select important feature - python

Related

NLP classification with sparse and numerical features crashes

Explication cross_val_score scikit_learn parameter cv

100% accuracy in random forest and 94% accuracy in KNN?

Make prediction from Pandas DataFrame

Scikit-Learn: Adjust train_size or test_size?

Categories

Resources