How to compile fitted model for coremltools? - python

I have made my machine model and need to upload it to Xcode using coremltools. I originally used sklearn ensemble as my machine model but coreml does not support it, so I decided to use either LinearSVC or LogisticRegression, which have the highest accuracy after training.
import numpy as np
import pandas as pd
#load the dataset of the file
#FYI:use quotation marks to escape comma or just not use the sentences
df = pd.read_csv('RhetoricalDevices1.csv', error_bad_lines=False, delimiter= ',', engine='python')
#print useful information about the data set
df.info()
df.head()
#check class distribution--number of each device uploaded
classes1 = df['Rhetorical Devices1']
classes2 = df['Rhetorical Devices2']
from sklearn.preprocessing import LabelEncoder
encoder1 = LabelEncoder()
encoder2 = LabelEncoder()
Y1 = encoder1.fit_transform(classes1.fillna('0'))
Y2 = encoder2.fit_transform(classes2.fillna('0'))
print(encoder1.inverse_transform([6]))
import nltk
from nltk.tokenize import word_tokenize
#creating a bag-of-words model
all_words = []
for sentences in processed:
words = word_tokenize(sentences)
for w in words:
all_words.append(w)
all_words = nltk.FreqDist(all_words)
# use the 2000 most common words as features
word_features = list(all_words.keys())[:2000]
#define a find_feature function
def find_features(sentence):
words = word_tokenize(sentence)
features = {}
for word in word_features:
features[word] = (word in words)
return features
#find features for all sentences
sentences = list(zip(processed, Y1))
#define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(sentences)
#call find_features function for each sentence
featuresets = [(find_features(text), label) for (text, label) in sentences]
# split training and testing data sets using sklearn
from sklearn import model_selection
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state = seed)
names = ['K Nearest Neighbors','Decision Tree','Random Forest','Logistic Regression','SGDClassifier','Multinomial','One Vs Rest Classifier']
classifiers = [
KNeighborsClassifier(n_jobs = -1),
DecisionTreeClassifier(class_weight = 'balanced'),
RandomForestClassifier(),
LogisticRegression(),
SGDClassifier(max_iter = 100, class_weight ='balanced', n_jobs = -1),
MultinomialNB(),
#GaussianProcessClassifier(),
LinearSVC()
]
models = list(zip(names, classifiers))
from nltk.classify.scikitlearn import SklearnClassifier
for name, model in models:
nltk_model = SklearnClassifier(model)
nltk_model.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("{} Accuracy: {}".format(name, accuracy))
when I tried following code, I get error "TypeError: Expected a 'fitted' model for conversion". How should I fix this?
model = LinearSVC()
coreml_model = coremltools.converters.sklearn.convert(model, 'Samples','Rhetorical Devices')

You should call fit() on your model with your training data, before converting it to CoreML.

Related

why smote raise "Found input variables with inconsistent numbers of samples"?

I try to classify emotion from tweet with dataset of 4401 tweet, when i use smaller sample of data (around 15 tweet) everything just work fine, but when i use the full dataset it raise the error of
Found input variables with inconsistent numbers of samples: [7, 3520]
the error happen when i try to oversampling the data using smote after transforming the data using countvectorizer.
This is the code where the error raise
# N-gram Feature and Term Frequency
vectorizer = CountVectorizer(ngram_range=(1,3))
x_train_tf = vectorizer.fit_transform(str(x_train).split('\n')).toarray()
x_test_tf = vectorizer.transform(str(x_test).split('\n')).toarray()
df_output = pd.DataFrame(data =x_train_tf, columns = vectorizer.get_feature_names_out())
display(df_output)
# the print shape is (7 rows × 250 columns)
smote = SMOTE(random_state=42, k_neighbors=5)
x_smote, y_smote = smote.fit_resample(x_train_tf, y_train)
print("Total Train Data SMOTE : ",x_smote.shape), print("Total Train Label SMOTE : ",y_smote)
i did not understand why this is happening so some explanation could really help.
i already tried to solve it using answers from other similiar question but nothing have worked.
This is the full code
import nltk
import re
#nltk.download()
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk import everygrams
from collections import Counter
from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
dataset = pd.read_csv("G:/TA/Program/dataset/Twitter_Emotion_Dataset.csv", encoding='latin-1')
# Preprocessing
dataset['case_folding_tweet'] = dataset['tweet'].str.casefold()
dataset['only_alphabet_tweet'] = [re.sub('[^a-zA-Z]+\s*', ' ', s) for s in dataset['case_folding_tweet']]
dataset['data_cleaning_tweet'] = dataset['only_alphabet_tweet'].str.replace(r'\b\w{1}\b','').str.replace(r'\s+', ' ')
slangword_dictionary = ("G:/TA/Program/dataset/kamus_singkatan.csv")
deslang = {}
list_slangword = open(slangword_dictionary).readlines()
for line in list_slangword:
slang, unslang = line.strip().split(';')
deslang[slang] = unslang
deslang[slang] = {r"\b{}\b".format(k): v for k, v in deslang.items()}
dataset['data_cleaning_tweet'] = dataset['data_cleaning_tweet'].replace(deslang[slang], regex=True)
dataset['convert_slang_tweet'] = dataset['data_cleaning_tweet']
replace_dictionary = {'tidak ': 'tidak', 'bukan ': 'bukan', 'jangan ': 'jangan', 'belum ': 'belum'}
dataset['convert_negation_tweet'] = dataset['convert_slang_tweet'].replace(replace_dictionary, regex=True)
dataset['tokenization_tweet'] = dataset['convert_negation_tweet'].apply(word_tokenize)
list_stopwords = set(stopwords.words("indonesian"))
list_stopwords.add('username')
list_stopwords.add('url')
dataset['stopword_removal_tweet'] = dataset['tokenization_tweet'].apply(lambda x: [item for item in x if item not in list_stopwords])
factory = StemmerFactory()
stemmer = factory.create_stemmer()
dataset['stemmed_tweet'] = dataset['stopword_removal_tweet'].apply(lambda x: [stemmer.stem(y) for y in x])
# Split data
x = dataset["stemmed_tweet"].values
y = dataset["label"].values
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state= 42)
# Get N-gram and TF
vectorizer = CountVectorizer(ngram_range=(1,3))
x_train_tf = vectorizer.fit_transform(str(x_train).split('\n')).toarray()
x_test_tf = vectorizer.transform(str(x_test).split('\n')).toarray()
# Oversampling
smote = SMOTE(random_state=42, k_neighbors=5)
x_smote, y_smote = smote.fit_resample(x_train_tf, y_train)
print("Total Train Data SMOTE : ",x_smote.shape), print("Total Train Label SMOTE : ",y_smote)
gnb_classifier = GaussianNB()
gnb_classifier.fit(x_smote, y_smote)
print(gnb_classifier)
y_pred = gnb_classifier.predict(x_test_tf)
print("Emotion Predicted :", y_pred)
Link to the dataset
I cannot solve it precisely because I don't have your data, but here are a few observations which should help:
apparently x_train_tf has only 7 rows? it's not enough for training a model and it's not 80% of 4401, as you're supposed to obtain from train_test_split.
Note that y_train has 3520 rows = 4401 * 80%, the correct number of rows.
I suspect that the line x_train_tf = vectorizer.fit_transform(str(x_train).split('\n')).toarray() is not doing what you think it does. Try to decompose the str(x_train).split('\n') part.
i fix the problem using the answer from this post answer
by joining all the train data column before vectorizing.
df_train = pd.DataFrame(data=x_train)
df_test = pd.DataFrame(data=x_test)
series = pd.Series(df_train['stemmed_tweet'])
corpus = series.apply(lambda series: ' '.join(series))
vectorizer = CountVectorizer(ngram_range=(1,3), lowercase=False)
x_train_tf = vectorizer.fit_transform(corpus).toarray()
x_test_tf = vectorizer.transform(str(df_test.values).split("\n")).toarray()

Workflow of NLP

When should I perform preprocessing and matrix creation of text data in NLP, before or after train_test_split? Below is my sample code where I have done preprocessing and matrix creation (tfidf) before train_test_split. I want to know will there be data leakage?
corpus = []
for i in range(0 ,len(data1)):
review = re.sub('[^a-zA-Z]', ' ', data1['features'][i])
review = review.lower()
review = review.split()
review = [stemmer.stem(j) for j in review if not j in set(stopwords.words('english'))]
review = ' '.join(review)
corpus.append(review)
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features = 6000)
x = cv.fit_transform(corpus).toarray()
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(data1['label'])
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state = 69,
stratify = y)
spam_model = MultinomialNB().fit(train_x, train_y)
pred = spam_model.predict(test_x)
c_matrix = confusion_matrix(test_y, pred)
acc_score = accuracy_score(test_y, pred)
As mentioned in the official documentation TfidfVectorizer class with max_features argument keeps only k-best features.
max_featuresint, default=None
If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.
If you present the class with the test set it would help to select this feature more efficiently and this is the data leakage (This scenario is based on your question but in most of the cases, it can be seen!) .
The safest way in machine learning is to ignore the test set until prediction/evaluation, think of it just like doesn't exist!
[UPDATED]
You can see an example from kaggle which uses vectorizer on pre-split datasets here!
More on this concept mentioned here and here!

ValueError: too many values to unpack (NLTK classifier)

I'm doing classification analysis using NLTK's Naive Bayes classifier. I insert a tsv file containing records and labels.
But the file doesn't get trained due to an error. Here's my python code
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('tweets.txt', delimiter ='\t', quoting = 3)
dataset.isnull().any()
dataset = dataset.fillna(method='ffill')
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0,16004):
tweet = re.sub('[^a-zA-Z]', ' ', dataset['tweet'][i])
tweet = tweet.lower()
tweet = tweet.split()
ps = PorterStemmer()
tweet = [ps.stem(word) for word in tweet if not word in
set(stopwords.words('english'))]
tweet = ' '.join(tweet)
corpus.append(tweet)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 10000)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20,
random_state = 0)
train_set, test_set = X_train[500:], y_train[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
The error is:
File "C:\Users\HSR\Anaconda2\lib\site-packages\nltk\classify\naivebayes.py", line 194, in train
for featureset, label in labeled_featuresets:
ValueError: too many values to unpack
NLTKClassifier doesn't work like scikit estimators. It requires the X and y both in a single array which is then passed to train().
But in your code, you are only supplying it the X_train and it tries to unpack y from that and hence the error.
The NaiveBayesClassifier requires the input to be a list of tuples where list denotes the training samples and the tuple has the feature dictionary and label inside. Something like:
X = [({feature1:'val11', feature2:'val12' .... }, class1),
({feature1:'val21', feature2:'val22' .... }, class2),
...
... ]
You need to change your input to this format.
feature_names = cv.get_feature_names()
train_set = []
for i, single_sample in enumerate(X):
single_feature_dict = {}
for j, single_feature in enumerate(single_sample):
single_feature_dict[feature_names[j]]=single_feature
train_set.append((single_feature_dict, y[i]))
Note: The above for loop can be shortened by using dict comprehension but I'm not that fluent there.
Then you can do this:
nltk.NaiveBayesClassifier.train(train_set)

How can I use word2vec to train a classifier?

The code is used to generate word2vec and use it to train the naive Bayes classifier.
I am able to generate word2vec and use the similarity functions successfully.As a next step I would want to use the word2vec to train the naive bayes classifier. Currently the code given an error when I am trying to slit the data in test and training. How do i convert word2vec model into the array so that it can be used as training data.
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import gensim
# Importing the dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1000):
review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
review = review.lower()
review = review.split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
# for word2vec we want an array of vectors
corpus.append(review)
#print(corpus)
X = gensim.models.Word2Vec(corpus, min_count=1,size=1000)
#print (X.most_similar("love"))
#embedding_matrix = np.zeros(len(X.wv.vocab), dtype='float32')
#for i in range(len(X.wv.vocab)):
# embedding_vector = X.wv[X.wv.index2word[i]]
# if embedding_vector is not None:
# embedding_matrix[i] = embedding_vector
# Creating the Bag of Words model
#from sklearn.feature_extraction.text import CountVectorizer
#cv = CountVectorizer(max_features = 1500)
#X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
It gives an error on line -
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
TypeError: Expected sequence or array-like, got <class 'gensim.models.word2vec.Word2Vec'>
Word2Vec provides word embeddings only. If you want to characterize documents by embeddings, you'll need to perform an averaging/summing/max operation on embeddings of all words from each document to have a D-dimensional vector that can be used for classification. See here and there for further information on this.
Otherwise, you can use Doc2Vec model to directly produce document embeddings, for which gensim also gives a very good provider.
You have vectors for each word, now you have two approaches to move forward, one could be simply take average of all the words in a sentence to find the sentence vector, another could be to use tfidf.
I implemented the average approach in one of my ongoing projects and i am sharing the github link, please go to the heading "text vectorization(word2vec)" and you will find the code their.
https://github.com/abhibhargav29/SentimentAnalysis/blob/master/SentimentAnalysis.ipynb. I would however suggest you to read data cleaning before as well to understand it in a better way.
One important advice: Do not split the data into train, cv, test after vectorization, do it before vectorization or you will overfit the model.

Text[Multi-Level] Classification with many outputs

Problem Statement:
To classify a text document to which category it belongs and also to classify up to two levels of the category.
Sample Training Set:
Description Category Level1 Level2
The gun shooting that happened in Vegas killed two Crime | High Crime High
Donald Trump elected as President of America Politics | High Politics High
Rian won in football qualifier Sports | Low Sports Low
Brazil won in football final Sports | High Sports High
Initial Attempt:
I tried to create a classifier model which would try to classify the Category using Random forest method and it gave me 90% overall.
Code1:
import pandas as pd
#import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
#from stemming.porter2 import stem
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score
stop = stopwords.words('english')
data_file = "Training_dataset_70k"
#Reading the input/ dataset
data = pd.read_csv( data_file, header = 0, delimiter= "\t", quoting = 3, encoding = "utf8")
data = data.dropna()
#Removing stopwords, punctuation and stemming
data['Description'] = data['Description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
data['Description'] = data['Description'].str.replace('[^\w\s]',' ').replace('\s+',' ')
#data['Description'] = data['Description'].apply(lambda x: ' '.join([stem(word) for word in x.split()]))
train_data, test_data, train_label, test_label = train_test_split(data.Description, data.Category, test_size=0.3, random_state=100)
RF = RandomForestClassifier(n_estimators=10)
vectorizer = TfidfVectorizer( max_features = 40000, ngram_range = ( 1,3 ), sublinear_tf = True )
data_features = vectorizer.fit_transform( train_data )
RF.fit(data_features, train_label)
test_data_feature = vectorizer.transform(test_data)
Output_predict = RF.predict(test_data_feature)
print "Overall_Accuracy: " + str(np.mean(Output_predict == test_label))
with codecs.open("out_Category.txt", "w", "utf8") as out:
for inp, pred, act in zip(test_data, Output_predict, test_label):
try:
out.write("{}\t{}\t{}\n".format(inp, pred, act))
except:
continue
Problem:
I want to add two more level to the model they are Level1 and Level2 the reasons for adding them is when I ran classification for Level1 alone I got 96% accuracy. I am stuck at splitting training and test dataset and to train a model which has three classifications.
Is it possible to create a model with three classification or should I create three models? How to split train and test data?
Edit1:
import string
import codecs
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from stemming.porter2 import stem
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score
stop = stopwords.words('english')
data_file = "Training_dataset_70k"
#Reading the input/ dataset
data = pd.read_csv( data_file, header = 0, delimiter= "\t", quoting = 3, encoding = "utf8")
data = data.dropna()
#Removing stopwords, punctuation and stemming
data['Description'] = data['Description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
data['Description'] = data['Description'].str.replace('[^\w\s]',' ').replace('\s+',' ')
train_data, test_data, train_label, test_label = train_test_split(data.Description, data[["Category", "Level1", "Level2"]], test_size=0.3, random_state=100)
RF = RandomForestClassifier(n_estimators=2)
vectorizer = TfidfVectorizer( max_features = 40000, ngram_range = ( 1,3 ), sublinear_tf = True )
data_features = vectorizer.fit_transform( train_data )
print len(train_data), len(train_label)
print train_label
RF.fit(data_features, train_label)
test_data_feature = vectorizer.transform(test_data)
#print test_data_feature
Output_predict = RF.predict(test_data_feature)
print "BreadCrumb_Accuracy: " + str(np.mean(Output_predict == test_label))
with codecs.open("out_bread_crumb.txt", "w", "utf8") as out:
for inp, pred, act in zip(test_data, Output_predict, test_label):
try:
out.write("{}\t{}\t{}\n".format(inp, pred, act))
except:
continue
The scikit-learn Random Forest Classifier natively supports multiple outputs (see this example). Therefore, you do not need to create three separate models.
From the documentation of RandomForestClassifier.fit, the inputs to fit functions are:
X : array-like or sparse matrix of shape = [n_samples, n_features]
y : array-like, shape = [n_samples] or [n_samples, n_outputs]
Therefore, you need an array y (your labels) of size N x 3 as your input to your RandomForestClassifier. In order to split your training and test set, you can do:
train_data, test_data, train_label, test_label = train_test_split(data.Description, data[['Category','Level 1','Level 2']], test_size=0.3, random_state=100)
Your train_label and test_label should be arrays of size N x 3 that you can use to fit your model compare your predictions (NB: I have not tested it here, you might need to do some transposes).

Categories

Resources