I have a classification script using simpletransformers. I am working with an imbalanced dataset with three labels (many 0, far fewer 1 and 2), so the RoBERTa classifier often ends up predicting only 0 and never the minority classes.
To get an overall estimate of the classifier's performance I would like to use 10-fold cross-validation instead of a single train/test split.
For this I am using the code below.
import pandas as pd
import numpy as np
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
df = pd.read_excel('Classifications_Output_NEW.xlsx')
df.shape
df["Q2"] = df["Q2"]. replace(np. nan,0)
df["Q2"] = df["Q2"].replace(3,0)
df["Q2"] = df["Q2"].replace(4,0)
# rename columns
df["text"] = df["SNIPPET"]
df["labels"] = df["Q2"]
# Function to replace the token before the sentiment word with the Q1 info
def replace_token_with_number(row):
    string = row['SNIPPET']
    number = row['Q1']
    return string.replace('[[', "xxproj " + str(number) + " ")
# Apply the function to every row in the dataframe
df["text"] = df.apply(replace_token_with_number, axis=1)
# replace the [+] and [-] markers in the text column with xxpositive / xxnegative tokens
df["text"] = df["text"].str.replace("[+]", " xxpositive", regex=False)
df["text"] = df["text"].str.replace("[-]", " xxnegative", regex=False)
# remove [[ and ]] from text column
df["text"] = df["text"].str.replace("[[", "", regex=False)
df["text"] = df["text"].str.replace("]]", "", regex=False)
# replace the Q1 number with a string
#df["text"] = df["text"].str.replace("xxproj1.0", "xxyesPJ", regex=False)
#df["text"] = df["text"].str.replace("xxproj0.0", "xxnoPJ", regex=False)
# prepare cross validation
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import pandas as pd
n=10
kf = KFold(n_splits=n, shuffle=True)
results = []
for train_index, val_index in kf.split(df):
    # splitting Dataframe (dataset not included)
    train_df = df["text"][train_index]
    val_df = df["labels"][val_index]
    # Defining Model
    model = ClassificationModel('roberta', 'roberta-base', num_labels=3, weight=class_weights.tolist(),
                                use_cuda=True, args={'reprocess_input_data': True, 'overwrite_output_dir': True,
                                                     "num_train_epochs": 10})
    # train the model
    model.train_model(train_df)
    # validate the model
    result, model_outputs, wrong_predictions = model.eval_model(val_df, acc=accuracy_score)
    print(result['acc'])
    # append model score
    results.append(result['acc'])
print("results", results)
print(f"Mean accuracy: {sum(results) / len(results)}")
Using this code produces an AttributeError: 'Series' object has no attribute 'columns'. I believe it has to do with the way the script accesses the columns of my dataframe but I am not able to solve the error.
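My current guess is that train_model and eval_model want a DataFrame containing both the "text" and "labels" columns, whereas the loop above passes single Series, so the fix might look roughly like the following untested sketch, but I am not sure this is right:
for train_index, val_index in kf.split(df):
    # sketch: slice whole rows and keep both columns instead of passing one Series
    train_df = df.iloc[train_index][["text", "labels"]]
    val_df = df.iloc[val_index][["text", "labels"]]
    model.train_model(train_df)
    result, model_outputs, wrong_predictions = model.eval_model(val_df, acc=accuracy_score)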
I am grateful for any advice!
I am trying to classify emotions in tweets with a dataset of 4401 tweets. When I use a smaller sample of the data (around 15 tweets) everything works fine, but when I use the full dataset it raises the error:
Found input variables with inconsistent numbers of samples: [7, 3520]
The error happens when I try to oversample the data using SMOTE after transforming it with CountVectorizer.
This is the code where the error is raised:
# N-gram Feature and Term Frequency
vectorizer = CountVectorizer(ngram_range=(1,3))
x_train_tf = vectorizer.fit_transform(str(x_train).split('\n')).toarray()
x_test_tf = vectorizer.transform(str(x_test).split('\n')).toarray()
df_output = pd.DataFrame(data =x_train_tf, columns = vectorizer.get_feature_names_out())
display(df_output)
# the print shape is (7 rows × 250 columns)
smote = SMOTE(random_state=42, k_neighbors=5)
x_smote, y_smote = smote.fit_resample(x_train_tf, y_train)
print("Total Train Data SMOTE : ",x_smote.shape), print("Total Train Label SMOTE : ",y_smote)
I don't understand why this is happening, so some explanation would really help.
I already tried to solve it using answers from other similar questions, but nothing has worked.
This is the full code:
import nltk
import re
#nltk.download()
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk import everygrams
from collections import Counter
from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
dataset = pd.read_csv("G:/TA/Program/dataset/Twitter_Emotion_Dataset.csv", encoding='latin-1')
# Preprocessing
dataset['case_folding_tweet'] = dataset['tweet'].str.casefold()
dataset['only_alphabet_tweet'] = [re.sub(r'[^a-zA-Z]+\s*', ' ', s) for s in dataset['case_folding_tweet']]
dataset['data_cleaning_tweet'] = dataset['only_alphabet_tweet'].str.replace(r'\b\w{1}\b', '', regex=True).str.replace(r'\s+', ' ', regex=True)
slangword_dictionary = ("G:/TA/Program/dataset/kamus_singkatan.csv")
deslang = {}
list_slangword = open(slangword_dictionary).readlines()
for line in list_slangword:
    slang, unslang = line.strip().split(';')
    deslang[slang] = unslang
deslang[slang] = {r"\b{}\b".format(k): v for k, v in deslang.items()}
dataset['data_cleaning_tweet'] = dataset['data_cleaning_tweet'].replace(deslang[slang], regex=True)
dataset['convert_slang_tweet'] = dataset['data_cleaning_tweet']
replace_dictionary = {'tidak ': 'tidak', 'bukan ': 'bukan', 'jangan ': 'jangan', 'belum ': 'belum'}
dataset['convert_negation_tweet'] = dataset['convert_slang_tweet'].replace(replace_dictionary, regex=True)
dataset['tokenization_tweet'] = dataset['convert_negation_tweet'].apply(word_tokenize)
list_stopwords = set(stopwords.words("indonesian"))
list_stopwords.add('username')
list_stopwords.add('url')
dataset['stopword_removal_tweet'] = dataset['tokenization_tweet'].apply(lambda x: [item for item in x if item not in list_stopwords])
factory = StemmerFactory()
stemmer = factory.create_stemmer()
dataset['stemmed_tweet'] = dataset['stopword_removal_tweet'].apply(lambda x: [stemmer.stem(y) for y in x])
# Split data
x = dataset["stemmed_tweet"].values
y = dataset["label"].values
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state= 42)
# Get N-gram and TF
vectorizer = CountVectorizer(ngram_range=(1,3))
x_train_tf = vectorizer.fit_transform(str(x_train).split('\n')).toarray()
x_test_tf = vectorizer.transform(str(x_test).split('\n')).toarray()
# Oversampling
smote = SMOTE(random_state=42, k_neighbors=5)
x_smote, y_smote = smote.fit_resample(x_train_tf, y_train)
print("Total Train Data SMOTE : ",x_smote.shape), print("Total Train Label SMOTE : ",y_smote)
gnb_classifier = GaussianNB()
gnb_classifier.fit(x_smote, y_smote)
print(gnb_classifier)
y_pred = gnb_classifier.predict(x_test_tf)
print("Emotion Predicted :", y_pred)
Link to the dataset
I cannot solve it precisely because I don't have your data, but here are a few observations which should help:
Apparently x_train_tf has only 7 rows? That is not enough for training a model, and it is not 80% of 4401, which is what you are supposed to obtain from train_test_split.
Note that y_train has 3520 rows = 4401 * 80%, the correct number of rows.
I suspect that the line x_train_tf = vectorizer.fit_transform(str(x_train).split('\n')).toarray() is not doing what you think it does. Try to decompose the str(x_train).split('\n') part.
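For example, a quick check like this (a sketch, I have not run it on your data) should show what that expression really produces:
# str() of a large numpy array is its truncated printed representation
# (it contains "..."), so splitting it on newlines gives only a few lines,
# not one line per tweet
lines = str(x_train).split('\n')
print(len(lines))   # a small number such as 7, not 3520
print(lines[:3])    # fragments of the array repr, not individual tweets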
I fixed the problem using the answer from this post, by joining all the tokens in the train data column before vectorizing.
df_train = pd.DataFrame(data=x_train)
df_test = pd.DataFrame(data=x_test)
series = pd.Series(df_train['stemmed_tweet'])
corpus = series.apply(lambda series: ' '.join(series))
vectorizer = CountVectorizer(ngram_range=(1,3), lowercase=False)
x_train_tf = vectorizer.fit_transform(corpus).toarray()
x_test_tf = vectorizer.transform(str(df_test.values).split("\n")).toarray()
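The test split presumably needs the same treatment; here is a sketch, assuming df_test also exposes the stemmed_tweet column of token lists the way df_train is indexed above:
# join the token lists of the test split the same way before transforming,
# instead of splitting the printed repr on newlines
test_corpus = df_test['stemmed_tweet'].apply(lambda tokens: ' '.join(tokens))
x_test_tf = vectorizer.transform(test_corpus).toarray()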
Good morning! I'm new to Python; I use Spyder 4.0 to build neural networks.
In the script below I use a random forest to compute feature importances, so the values in importances tell me how important each feature is. Unfortunately I can't upload the dataset, but I can tell you that there are 18 features and 1 label, all physical quantities, and it's a regression problem.
I want to export the variable importances to an Excel file, but when I do so (simply copying the vector) the numbers are written with a dot (e.g. 0.012, 0.015, ... etc.). To use them in the Excel file I would prefer a comma instead of the dot.
I tried to use .replace('.',',') but it doesn't work; the error is:
AttributeError: 'numpy.ndarray' object has no attribute 'replace'
I think it happens because the vector importances is a numpy array of float64 with shape (18,).
What can I do?
Thanks.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
dataset = pd.read_csv('Dataset.csv', decimal=',', delimiter = ";")
label = dataset.iloc[:,-1]
features = dataset.drop(columns = ['Label'])
y_max_pre_normalize = max(label)
y_min_pre_normalize = min(label)
def denormalize(y):
    final_value = y*(y_max_pre_normalize-y_min_pre_normalize)+y_min_pre_normalize
    return final_value
X_train1, X_test1, y_train1, y_test1 = train_test_split(features, label, test_size = 0.20, shuffle = True)
y_test2 = y_test1.to_frame()
y_train2 = y_train1.to_frame()
scaler1 = preprocessing.MinMaxScaler()
scaler2 = preprocessing.MinMaxScaler()
X_train = scaler1.fit_transform(X_train1)
X_test = scaler2.fit_transform(X_test1)
scaler3 = preprocessing.MinMaxScaler()
scaler4 = preprocessing.MinMaxScaler()
y_train = scaler3.fit_transform(y_train2)
y_test = scaler4.fit_transform(y_test2)
sel = RandomForestRegressor(n_estimators = 200,max_depth = 9, max_features = 5, min_samples_leaf = 1, min_samples_split = 2,bootstrap = False)
sel.fit(X_train, y_train)
importances = sel.feature_importances_
# sel.fit(X_train, y_train)
# a = []
# for feature_list_index in sel.get_support(indices=True):
# a.append(feat_labels[feature_list_index])
# print(feat_labels[feature_list_index])
# X_important_train = sel.transform(X_train1)
# X_important_test = sel.transform(X_test1)
I will try to show you an example of what you could do using some random values. I ran this in the Python shell, which is why you also see the ">>>".
>>> import numpy as np # first I import numpy as "np"
# I generate 10 random values and I store them in "importance"
>>> importance=np.random.rand(10)
# here I just want to see the content of "importance"
>>> importance
array([0.77609076, 0.97746829, 0.56946118, 0.23986983, 0.93655692,
0.22003531, 0.7711095 , 0.36083248, 0.58277805, 0.57865248])
# here there is your error that I reproduce for teaching purpose
>>> importance.replace(".", ",")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'numpy.ndarray' object has no attribute 'replace'
What you need to do is to convert the elements of "importance" to a list of strings:
>>> imp_astr=[str(i) for i in importance]
>>> imp_astr
['0.7760907642658763', '0.9774682868805988', '0.569461184647781', '0.23986982589422634', '0.9365569207431337', '0.22003531170279356', '0.7711094966708247', '0.3608324767276052', '0.5827780487688116', '0.5786524781334242']
# at the end, for each string, you can use the "replace" function
>>> imp_astr=[i.replace(".", ",") for i in imp_astr]
>>> imp_astr
['0,7760907642658763', '0,9774682868805988', '0,569461184647781', '0,23986982589422634', '0,9365569207431337', '0,22003531170279356', '0,7711094966708247', '0,3608324767276052', '0,5827780487688116', '0,5786524781334242']
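If the goal is just to open the values in Excel with comma decimals, another option might be to let pandas do the conversion when writing the file (a sketch, not tested on your data):
import pandas as pd
# write with ';' as the column separator and ',' as the decimal separator,
# which Excel in comma-decimal locales reads directly
pd.DataFrame({'importance': importance}).to_csv('importances.csv', sep=';', decimal=',', index=False)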
Is this the best way to work with pandas and a vectorizer? Converting a dataframe to a dict, vectorizing it, and putting everything in a new dataframe? Or is there a better way to work with it?
import pandas as pd
# Putting AmesHousing.txt data into a dataframe
data = pd.read_csv('AmesHousing.txt', encoding='UTF-8', delimiter='\t')
data = data.fillna(0)
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False, dtype=int)
df = pd.DataFrame(vec.fit_transform(data.T.to_dict().values()), columns = [vec.get_feature_names()])
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
#Here we are splitting our data with 2 pieces: train and test. Test will have 33% of data; train will have all the rest
train, test = train_test_split(df, test_size=0.33, random_state=42)
model = LinearRegression()
model.fit(train.drop(['SalePrice'], axis=1), train[['SalePrice']])
predict = model.predict(test.drop(['SalePrice'], axis=1))
MSE = mean_squared_error(predict,test[['SalePrice']])
RMSE = np.sqrt(MSE)
print('MSE:',MSE,'RMSE:',RMSE)
I'm testing code like this.
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tabulate import tabulate
#Seaborn for easier visualization
import seaborn as sns
# Load Iris Flower Dataset
# Load data
df = pd.read_csv('C:\\path_to_file\\train.csv')
df.shape
list(df)
# the model can only handle numeric values so filter out the rest
# data = df.select_dtypes(include=[np.number]).interpolate().dropna()
df1 = df.select_dtypes(include=[np.number])
df1.shape
list(df1)
df1.dtypes
df1 = df1.fillna(0)
#Prerequisites
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
#Split train/test sets
# y = df1.SalePrice
X = df1.drop(['index'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.33)
# Train model
clf = RandomForestRegressor(n_jobs=2, n_estimators=1000)
model = clf.fit(X_train, y_train)
# Feature Importance
headers = ['name', 'score']
values = sorted(zip(X_train.columns, model.feature_importances_), key=lambda x: x[1] * -1)
print(tabulate(values, headers, tablefmt='plain'))
(pd.Series(model.feature_importances_, index=X.columns)
.nlargest(10)
.plot(kind='barh'))
This works fine on some sample data that I found online. Now, rather than predicting a sale price as my y variable, I'm trying to figure out how to get the model to make a prediction like target = True or target = False, or maybe my approach is wrong.
It's a bit confusing for me because of this line: df1 = df.select_dtypes(include=[np.number]). So only numbers are included, which makes sense for a RandomForestRegressor. I'm just looking for some guidance on how to deal with a non-numeric prediction here.
You are dealing with a classification problem here with 2 classes (True, False). To get started take a look at a simple logistic regression model.
https://en.wikipedia.org/wiki/Logistic_regression
Since you are using sklearn try:
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
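A minimal sketch of how that could look with your dataframe (note: 'target' below is a placeholder for whatever your True/False column is actually called):
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

y = df1['target']                      # hypothetical name of the True/False column
X = df1.drop(['target'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.33)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print(clf.predict(X_test))             # True/False class predictions
print(clf.predict_proba(X_test))       # class probabilities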
I have a train and a test dataset. I want to make predictions for my test dataset and save them as a CSV. The problem is that
I can't save the results for my test dataset; every time, I end up saving the results for the training dataset.
It would be great if you could tell me what I am missing.
Here is my code.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import random
from sklearn.metrics import roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
Train = pd.read_csv('Dataset/train.csv', delimiter=';')
Test = pd.read_csv('Dataset/train.csv', delimiter=';')
Train['Type'] = 'Train' # Create a flag for Train and Test Data set
Test['Type'] = 'Test'
FullData = pd.concat([Train, Test], axis=0) # Combined both Train and Test Data set
ID_Col = ['USER_ID'] # ID Variables
Target_Col = ["ACTIVITY_DEC_16"]
Cat_Cols = ['ACT_DATE', 'STATUS', 'TP_CURRENT', 'TP_CHANGES_NUM', 'START_PACK', 'OFFER_GROUP', 'BIRTHDAY', 'GENDER',
'MLLS_STATE',
'PORTED_IN', 'PORTED_OUT', 'OBLIG_NUM', 'OBLIG_ON_START', 'ASSET_TYPE_LAST', 'DEVICE_TYPE_BUS',
'USAGE_AREA', 'REFILL_OCT_16', 'REFILL_NOV_16',
'OUTGOING_OCT_16', 'OUTGOING_NOV_16', 'GPRS_OCT_16', 'GPRS_NOV_16', 'REVENUE_OCT_16',
'REVENUE_NOV_16'] # Categorical Variables
Num_Cols = list(set(list(FullData.columns)) - set(Cat_Cols) - set(ID_Col) - set(Target_Col)) # Numerical Variables
Other_Col = ['Type'] # Test and Train Data Set Identifier
Num_Cat_Cols = Num_Cols + Cat_Cols # Combined numerical and Categorical variables
# Create a new variable for each variable having missing value with VariableName_NA
# and flag missing value with 1 and other with 0
for var in Num_Cat_Cols:
    if FullData[var].isnull().any() == True:
        FullData[var + '_NA'] = FullData[var].isnull() * 1
# Impute numerical missing values with the mean
FullData[Num_Cols] = FullData[Num_Cols].fillna(FullData[Num_Cols].mean())
FullData[Cat_Cols] = FullData[Cat_Cols].fillna(value=-9999)
# Create a label encoders for categorical variables and split the data set to train & test. Further split the train data set to Train and Validate
for var in Cat_Cols:
    number = LabelEncoder()
    FullData[var] = number.fit_transform(FullData[var].astype('str'))
# Target Variable is also a categorical so convert it
FullData["ACTIVITY_DEC_16"] = number.fit_transform((FullData["ACTIVITY_DEC_16"].astype('str')))
Train = FullData[FullData['Type'] == 'Train']
Test = FullData[FullData['Type'] == 'Test']
Train['is_train'] = np.random.uniform(0, 1, len(Train)) <= 0.75
Train, Validate = Train[Train['is_train'] == True], Train[Train['is_train'] == False]
Features = list(set(list(FullData.columns)) - set(ID_Col) - set(Target_Col) - set(Other_Col))
X_Train = Train[list(Features)]
Y_Train = Train["ACTIVITY_DEC_16"].values
X_Validate = Validate[list(Features)].values
Y_Validate = Validate["ACTIVITY_DEC_16"].values
X_Test = Test[list(Features)].values
random.seed(100)
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(X_Train, Y_Train)
Status = rf.predict_proba(X_Validate)
fpr, tpr, _ = roc_curve(Y_Validate, Status[:, 1])
roc_auc = metrics.auc(fpr, tpr)
Final_Status = rf.predict_proba(X_Test)
print(Final_Status)
Test['ACTIVITY_DEC_16_PROB'] = Final_Status[:, 1]
Test.to_csv('/Users/isozyesil/PycharmProjects/TaskNo2/Dataset/Output.csv', columns=['USER_ID', 'ACTIVITY_DEC_16_PROB'])
You are reading the training data again instead of the test data. It should be:
Train = pd.read_csv('Dataset/train.csv', delimiter=';')
Test = pd.read_csv('Dataset/test.csv', delimiter=';')