I need some assistance; I'm stuck on the concept of implementing pipelines with sklearn. The dataset is the KC Housing dataset from Kaggle. I am trying to build a simple linear regression using pipelines, but I am missing something basic, as I cannot get past the error pasted at the bottom of this post. Please advise; it's really appreciated. The complete code is below, so feel free to mend it where necessary.
**ERROR:**
Traceback (most recent call last):
  File "/media/JBook/Software/PythonProjects/KCH/OneFi.py", line 123, in <module>
    main()
  File "/media/JBook/Software/PythonProjects/KCH/OneFi.py", line 118, in main
    predictions_some_data = lin_reg.predict(some_data_prepared)
  File "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/base.py", line 256, in predict
    return self._decision_function(X)
  File "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/base.py", line 241, in _decision_function
    dense_output=True) + self.intercept_
  File "/usr/local/lib/python3.5/dist-packages/sklearn/utils/extmath.py", line 135, in safe_sparse_dot
    ret = a * b
  File "/usr/local/lib/python3.5/dist-packages/scipy/sparse/base.py", line 387, in __mul__
    raise ValueError('dimension mismatch')
ValueError: dimension mismatch
PS: The problem I am facing is almost at the end of this code:
"predictions_some_data = lin_reg.predict(some_data_prepared)"
import pandas as pd
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import data_visualize
from sklearn.model_selection import StratifiedShuffleSplit
import dataPrep
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
## Loading the data
KC_housing_path = "/media/JBook/Software/PythonProjects/KCH/datasets"
def load_housing_data(housing_path=KC_housing_path):
    '''if not os.path.isfile("datasets/kc_house_data.csv"):
        print("Check file location, program exiting..")
    else:
        csv_path = os.path.join(housing_path, "kc_house_data.csv")
        print("reading csv file ...")
        return pd.read_csv(csv_path)'''
    try:
        csv_path = os.path.join(housing_path, "kc_house_data.csv")
        print("reading csv file -->")
        return pd.read_csv(csv_path)
    except FileNotFoundError:
        print("Check file location, program exiting ...")
        sys.exit()
### Defining 2 classes for custom transformers
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attributeNames):
        self.attributes = attributeNames
        # print('\n In constructor', self.attributes)

    def fit(self, X, y=None):
        # print("__DF Fit Method:\n", (X[self.attributes].values).shape)
        return self

    def transform(self, X):
        # print("__Transform Method:\n", (X[self.attributes].values).shape)
        return X[self.attributes].values

class LabelBinarizerPipelineFriendly(LabelBinarizer):
    def fit(self, X, y=None):
        # print("LB-->X.shape", X.shape)
        """this would allow us to fit the model based on the X input."""
        super(LabelBinarizerPipelineFriendly, self).fit(X)

    def transform(self, X, y=None):
        # print("LB-Transform-X.shape", X.shape)
        return super(LabelBinarizerPipelineFriendly, self).transform(X)

    def fit_transform(self, X, y=None):
        # print("LB-FIT_TRANSFORM-X.Shape", X.shape)
        return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X)
def main():
    # Loading House data
    housing = load_housing_data()
    housing_labels = housing['price']

    # Removing not needed features & label (price)
    rem_attributes = ['id', 'date', 'price']
    housing_col_removed = housing.drop(rem_attributes, axis=1, inplace=False)

    ### Splitting the data
    train_set, test_set = train_test_split(housing_col_removed, test_size=0.3, random_state=42)

    #### Pipeline for numeric & categorical attribute transformations
    #### Adding median to missing values & making one-hot vectors of categorical attributes
    data_numeric = housing_col_removed.drop('ocean_proximity', axis=1, inplace=False)
    numeric_attrib = list(data_numeric)
    cat_attrib = ['ocean_proximity']

    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(numeric_attrib)),
        ('imputing', Imputer(missing_values=0, strategy='median')),
    ])

    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attrib)),
        ('label_Bin', LabelBinarizerPipelineFriendly(sparse_output=True)),
    ])

    full_pipeline = FeatureUnion(transformer_list=[
        ('num_pipeline', num_pipeline),
        ('cat_pipeline', cat_pipeline),
    ])

    #### Fitting the linear regression model
    # print('This housing data is passed to prepare\n', housing_col_removed.head())
    housing_prepared = dataPrep.prepData(housing_col_removed)
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)
    print('Housing Prepared Shape: \n', housing_prepared.shape)
    print('\t\t\t\t************* Predictions from Linear Regression Are ***********\n', lin_reg.predict(housing_prepared))

    ### Below section tries to use some data (5 rows) from the whole data set to predict values
    some_data = housing_col_removed[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.fit_transform(some_data)
    print('\t\t\tSome Data Prepared is\n', some_data_prepared)

    predictions_some_data = lin_reg.predict(some_data_prepared)
    print('\t\t\t\t************* Predictions from Linear Regression Are ***********\n', predictions_some_data)
    # print('\t\t\t\t************* Labels Are ***********\n', list(some_labels))

main()
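For reference, the dimension mismatch here is most likely caused by the last preprocessing step: full_pipeline.fit_transform(some_data) re-fits the pipeline on just five rows, so the LabelBinarizer only learns the categories present in that slice and emits a narrower one-hot block than the matrix lin_reg was trained on. A minimal sketch of the usual pattern (assuming dataPrep.prepData applies the same preprocessing as full_pipeline): fit the preprocessing once on the full data, then only transform() new rows.

housing_prepared = full_pipeline.fit_transform(housing_col_removed)
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# transform(), not fit_transform(): keeps the one-hot columns at the
# width the regressor was trained on
some_data_prepared = full_pipeline.transform(housing_col_removed[:5])
predictions_some_data = lin_reg.predict(some_data_prepared)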
Related
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.metrics import fbeta_score, make_scorer
import keras.backend as K
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
import pandas as pd
class CustomThreshold(BaseEstimator, ClassifierMixin):
    """Custom threshold wrapper for binary classification"""
    def __init__(self, base, threshold=0.5):
        self.base = base
        self.threshold = threshold

    def fit(self, *args, **kwargs):
        self.base.fit(*args, **kwargs)
        return self

    def predict(self, X):
        return (self.base.predict_proba(X)[:, 1] > self.threshold).astype(int)
dataset_clinical = np.genfromtxt("/content/drive/MyDrive/Colab Notebooks/BreastCancer-master/Data/stacked_metadata.csv",delimiter=",")
X = dataset_clinical[:,0:450]
Y = dataset_clinical[:,450]
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1)
rf = RandomForestClassifier(n_estimators=10).fit(X,Y)
clf = [CustomThreshold(rf, threshold) for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]]
for model in clf:
    print(confusion_matrix(y_test, model.predict(X_test)))

for model in clf:
    print(confusion_matrix(Y, model.predict(X)))
The traceback displays the following:

Traceback (most recent call last):
  File "RF.py", line 33, in <module>
    rf = RandomForestClassifier(n_estimators=10).fit(X,Y)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 328, in fit
    X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
  File "/usr/local/lib/python3.7/dist-packages/sklearn/base.py", line 576, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py", line 968, in check_X_y
    estimator=estimator,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py", line 792, in check_array
    _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan")
  File "/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py", line 116, in _assert_all_finite
    type_err, msg_dtype if msg_dtype is not None else X.dtype
ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
At first glance I would say: check your dataset for missing values, outliers, etc.
A big part of any ML project is data exploration and preprocessing. Here is a beginner-friendly guide for that using pandas: https://towardsdatascience.com/data-visualization-exploration-using-pandas-only-beginner-a0a52eb723d5
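As a concrete first check, a minimal sketch (reusing the path from the snippet above); note that np.genfromtxt turns any cell it cannot parse as a number, including a header row, into nan:

import numpy as np

dataset_clinical = np.genfromtxt(
    "/content/drive/MyDrive/Colab Notebooks/BreastCancer-master/Data/stacked_metadata.csv",
    delimiter=",")

print("NaNs per column:", np.isnan(dataset_clinical).sum(axis=0))
print("any inf:", np.isinf(dataset_clinical).any())

# one simple option: keep only fully finite rows (imputation is another)
clean = dataset_clinical[np.isfinite(dataset_clinical).all(axis=1)]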
I am trying to convert/store a sklearn SVC model as a .onnx file, and I am getting a runtime error I do not understand. I have been able to use this same code without error with a sklearn random forest classifier and a sklearn k-NN classifier. Any help with this error is appreciated.
Below I first post the output of running my file svm_time.py, and below that the code contained in svm_time.py.
Thanks.
python3 svm_time.py
'train_model' 4809.58 ms
train score is: 0.8765468473777254
val Accuracy is: 0.7037037037037037
Traceback (most recent call last):
  File "svm_time.py", line 97, in <module>
    main()
  File "svm_time.py", line 91, in main
    onx = convert_sklearn(clf, initial_types=initial_type)
  File "/home/matt/anaconda3/envs/venv/lib/python3.7/site-packages/skl2onnx/convert.py", line 154, in convert_sklearn
    dtype=dtype, options=options)
  File "/home/matt/anaconda3/envs/venv/lib/python3.7/site-packages/skl2onnx/common/_topology.py", line 1054, in convert_topology
    conv(scope, operator, container)
  File "/home/matt/anaconda3/envs/venv/lib/python3.7/site-packages/skl2onnx/common/_registration.py", line 29, in __call__
    return self._fct(*args)
  File "/home/matt/anaconda3/envs/venv/lib/python3.7/site-packages/skl2onnx/operator_converters/support_vector_machines.py", line 221, in convert_sklearn_svm_classifier
    "Classes different from first n integers are not supported "
RuntimeError: Classes different from first n integers are not supported in SVC converter.
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split
import time
import math
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# https://medium.com/pythonhive/python-decorator-to-measure-the-execution-time-of-methods-fa04cb6bb36d
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r %2.2f ms' %
                  (method.__name__, (te - ts) * 1000))
        return result
    return timed

clf = SVC(kernel='rbf', gamma=0.001, C=10)

@timeit
def train_model(in_data, in_labels):
    clf.fit(in_data, in_labels)

def main():
    data = pd.read_csv('fall_data.csv', header=None)
    labels = pd.read_csv('fall_labels.csv', header=None)

    data = data.to_numpy()
    data_labels = labels[0]

    train_set, test_set, train_label, test_label = train_test_split(
        data, data_labels, test_size=0.1, random_state=42)
    train_set2, val_set, train_label2, val_label = train_test_split(
        train_set, train_label, test_size=0.1, random_state=42)

    scaler = StandardScaler().fit(train_set2)
    X_train = scaler.transform(train_set2)
    X_val = scaler.transform(val_set)

    train_model(X_train, train_label2)

    tpred = clf.predict(X_train)
    ts = accuracy_score(train_label2, tpred)
    print('train score is: ', ts)

    pred = clf.predict(X_val)
    s = accuracy_score(val_label, pred)
    print('val Accuracy is: ', s)

    initial_type = [('float_input', FloatTensorType([None, 453]))]
    onx = convert_sklearn(clf, initial_types=initial_type)
    with open("svmrbf_unimib_f8.onnx", "wb") as f:
        f.write(onx.SerializeToString())

if __name__ == '__main__':
    main()
It seems to me that this may be a compatibility issue between ONNX and sklearn.

1) https://github.com/onnx/sklearn-onnx/issues/302
2) https://github.com/onnx/sklearn-onnx/blob/master/skl2onnx/operator_converters/support_vector_machines.py#L17

Based on these two sources, I changed my code to use the OVO decision function shape instead of OVR, and now, at least when I run my svm_time.py file, an .onnx file is saved:
clf = SVC(kernel='rbf', gamma=0.001, C=10, decision_function_shape='ovo')
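If the labels themselves are the issue, the error text ("Classes different from first n integers are not supported") suggests another possible workaround, inferred from that message rather than from the skl2onnx docs: re-encode the labels to 0..n-1 before fitting.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(data_labels)   # arbitrary labels -> 0, 1, ..., n-1

# fit clf on y_encoded instead of data_labels, convert as before, and use
# le.inverse_transform(...) to map predictions back to the original labels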
I have been trying to get the feature names out of my model for quite some time now, but I have a hard time understanding how to do it. I have tried many posts on here, but can't get it to work. Here is my code:
Loading the classes I need to combine TfidfVectorizer with other features:
from sklearn.base import TransformerMixin, BaseEstimator

class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

class FeatureTypeSelector(TransformerMixin, BaseEstimator):
    FEATURE_TYPES = {
        'categorical': [
            'COLUMN_A', 'COLUMN_B'
        ],
        'continuous': [
            'COLUMN_C', 'COLUMN_D'
        ],
    }

    def __init__(self, feature_type):
        self.columns = self.FEATURE_TYPES[feature_type]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]

class RowToDictTransformer(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return (row[1] for row in X.iterrows())
Then the code to put everything in a pipeline and run the regressor:

from sklearn.pipeline import make_union, make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

# Create the preprocessor
preprocessor = make_union(
    make_pipeline(
        ItemSelector(key='TEXT_COLUMN'),
        TfidfVectorizer(lowercase=False, min_df=1),
    ),
    make_pipeline(
        FeatureTypeSelector('continuous'),
        MinMaxScaler(),
    ),
    make_pipeline(
        FeatureTypeSelector('categorical'),
        RowToDictTransformer(),
        DictVectorizer(sparse=False),  # set sparse=True if you get MemoryError
    ),
)

# Fit and transform the data
preprocessor.fit_transform(x_train)

# Choose some estimator
# estimator = MultinomialNB()
estimator = LinearRegression()

# Create the model
model = make_pipeline(preprocessor, estimator)

# Train the model
model.fit(x_train, y_train)

# Predict on the test set
predicted = model.predict(x_test)
I can run model.coef_ to get all the coefficients, but I want to see how each item of the TEXT_COLUMN is affected by which weight. I have tried calling get_feature_names() and tried passing the names through the pipeline, but with no success (most of Google's results are purple by now).
Can anyone give me a bit of guidance on how to pass the feature names to the end of the pipeline? The ideal result would be a dataframe with the feature (row from the TEXT_COLUMN) and feature_weight as value.
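A minimal sketch of one approach, assuming an older sklearn where TfidfVectorizer and DictVectorizer still expose get_feature_names() (newer releases use get_feature_names_out()), and relying on make_union concatenating its blocks in declaration order (tf-idf terms, then continuous columns, then one-hot categoricals):

import pandas as pd

tfidf = preprocessor.transformer_list[0][1].named_steps['tfidfvectorizer']
dictvec = preprocessor.transformer_list[2][1].named_steps['dictvectorizer']

feature_names = (
    list(tfidf.get_feature_names())                     # terms from TEXT_COLUMN
    + FeatureTypeSelector.FEATURE_TYPES['continuous']   # scaled as-is, names unchanged
    + list(dictvec.get_feature_names())                 # one-hot categorical columns
)

coefs = model.named_steps['linearregression'].coef_.ravel()
weights = pd.DataFrame({'feature': feature_names, 'weight': coefs})
print(weights.sort_values('weight', ascending=False).head(20))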
I am trying to build a sentiment analyzer using scikit-learn/pandas. Building and evaluating the model works, but attempting to classify new sample text does not.
My code:
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
infile = 'Sentiment_Analysis_Dataset.csv'
data = "SentimentText"
labels = "Sentiment"

class Classifier():
    def __init__(self):
        self.train_set, self.test_set = self.load_data()
        self.counts, self.test_counts = self.vectorize()
        self.classifier = self.train_model()

    def load_data(self):
        df = pd.read_csv(infile, header=0, error_bad_lines=False)
        train_set, test_set = train_test_split(df, test_size=.3)
        return train_set, test_set

    def train_model(self):
        classifier = BernoulliNB()
        targets = self.train_set[labels]
        classifier.fit(self.counts, targets)
        return classifier

    def vectorize(self):
        vectorizer = TfidfVectorizer(min_df=5,
                                     max_df=0.8,
                                     sublinear_tf=True,
                                     ngram_range=(1, 2),
                                     use_idf=True)
        counts = vectorizer.fit_transform(self.train_set[data])
        test_counts = vectorizer.transform(self.test_set[data])
        return counts, test_counts

    def evaluate(self):
        test_counts, test_set = self.test_counts, self.test_set
        predictions = self.classifier.predict(test_counts)
        print(classification_report(test_set[labels], predictions))
        print("The accuracy score is {:.2%}".format(accuracy_score(test_set[labels], predictions)))

    def classify(self, input):
        input_text = input
        input_vectorizer = TfidfVectorizer(min_df=5,
                                           max_df=0.8,
                                           sublinear_tf=True,
                                           ngram_range=(1, 2),
                                           use_idf=True)
        input_counts = input_vectorizer.transform(input_text)
        predictions = self.classifier.predict(input_counts)
        print(predictions)

myModel = Classifier()
text = ['I like this I feel good about it', 'give me 5 dollars']
myModel.classify(text)
myModel.evaluate()
The error:
Traceback (most recent call last):
  File "sentiment.py", line 74, in <module>
    myModel.classify(text)
  File "sentiment.py", line 66, in classify
    input_counts = input_vectorizer.transform(input_text)
  File "/home/rachel/Sentiment/ENV/lib/python3.5/site-packages/sklearn/feature_extraction/text.py", line 1380, in transform
    X = super(TfidfVectorizer, self).transform(raw_documents)
  File "/home/rachel/Sentiment/ENV/lib/python3.5/site-packages/sklearn/feature_extraction/text.py", line 890, in transform
    self._check_vocabulary()
  File "/home/rachel/Sentiment/ENV/lib/python3.5/site-packages/sklearn/feature_extraction/text.py", line 278, in _check_vocabulary
    check_is_fitted(self, 'vocabulary_', msg=msg),
  File "/home/rachel/Sentiment/ENV/lib/python3.5/site-packages/sklearn/utils/validation.py", line 690, in check_is_fitted
    raise _NotFittedError(msg % {'name': type(estimator).__name__})
sklearn.exceptions.NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.
I'm not sure what the issue could be. In my classify method, I create a brand-new vectorizer to process the text I want to classify, separate from the vectorizer used to create the training and test data for the model.
Thanks
You've fitted a vectorizer, but you throw it away because it doesn't exist past the lifetime of your vectorize function. Instead, save your model in vectorize after it's been transformed:
self._vectorizer = vectorizer
Then in your classify function, don't create a new vectorizer. Instead, use the one you'd fitted to the training data:
input_counts = self._vectorizer.transform(input_text)
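Putting those two changes together, a minimal sketch (everything else in the class stays as in the question):

def vectorize(self):
    vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True,
                                 ngram_range=(1, 2), use_idf=True)
    counts = vectorizer.fit_transform(self.train_set[data])
    test_counts = vectorizer.transform(self.test_set[data])
    self._vectorizer = vectorizer      # keep the fitted vectorizer around
    return counts, test_counts

def classify(self, input_text):
    # reuse the vocabulary learned on the training data
    input_counts = self._vectorizer.transform(input_text)
    print(self.classifier.predict(input_counts))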
Save the vectorizer as a pickle or joblib file and load it when you want to predict.

pickle.dump(vectorizer, open("vectorizer.pickle", "wb"))    # save the vectorizer
vectorizer = pickle.load(open("vectorizer.pickle", "rb"))   # load the vectorizer
You can save both the model and the vectorizer and use them later on as well; here is how I did it:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
import pickle

# Train the classification model
def train_model():
    df = pd.read_json('intent_data.json')

    X_train, X_test, y_train, y_test = train_test_split(df['Utterance'], df['Intent'], random_state=0)

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    model = LinearSVC().fit(X_train_tfidf, y_train)

    # Save the vectorizer
    vec_file = 'vectorizer.pickle'
    pickle.dump(count_vect, open(vec_file, 'wb'))

    # Save the model
    mod_file = 'classification.model'
    pickle.dump(model, open(mod_file, 'wb'))

# Load the classification model from disk and use for predictions
def classify_utterance(utt):
    # load the vectorizer
    loaded_vectorizer = pickle.load(open('vectorizer.pickle', 'rb'))

    # load the model
    loaded_model = pickle.load(open('classification.model', 'rb'))

    # make a prediction
    print(loaded_model.predict(loaded_vectorizer.transform([utt])))
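Usage then looks like this (the utterance is just a made-up example):

train_model()                             # fit and persist the vectorizer and model once
classify_utterance('where is my order')   # loads both files and prints the predicted intent

One caveat: the model above was fitted on tf-idf features, so for fully consistent predictions the fitted TfidfTransformer should be pickled and applied after the CountVectorizer as well.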
So I've been working on this chatbot project. I'm using an SVM for its ML, and I really want to use cosine similarity as the kernel. I've tried using pykernel (as suggested in this post) and code from another source, but it's still not working, and I don't know why...
Say that I have train.py code like this:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle, csv, json, timeit, random, os, nltk
from nltk.stem.lancaster import LancasterStemmer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import LabelEncoder as LE
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from my_kernel import my_kernel

def preprocessing(text):
    factory1 = StopWordRemoverFactory()
    StopWord = factory1.create_stop_word_remover()
    text = StopWord.remove(text)
    factory2 = StemmerFactory()
    stemmer = factory2.create_stemmer()
    return (stemmer.stem(text))

le = LE()
tfv = TfidfVectorizer(min_df=1)

file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "scraping", "tes.json")
svm_pickle_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "svm_model.pickle")
if os.path.exists(svm_pickle_path):
    os.remove(svm_pickle_path)

tit = []   # Title
cat = []   # Category
post = []  # Post

with open(file, "r") as sentences_file:
    reader = json.load(sentences_file)
    for row in reader:
        tit.append(preprocessing(row["Judul"]))
        cat.append(preprocessing(row["Kategori"]))
        post.append(preprocessing(row["Post"]))

tfv.fit(tit)
le.fit(cat)
features = tfv.transform(tit)
labels = le.transform(cat)

trainx, testx, trainy, testy = tts(features, labels, test_size=.30, random_state=42)
model = SVC(kernel=my_kernel, C=1.5)

f = open(svm_pickle_path, 'wb')
pickle.dump(model.fit(trainx, trainy), f)
f.close()
print("SVC training score:", model.score(testx, testy))

with open(svm_pickle_path, 'rb') as file:
    pickle_model = pickle.load(file)

score = pickle_model.score(testx, testy)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(testx)
print(Ypredict)
and for my_kernel.py code :
import numpy as np
import math
from numpy import linalg as LA

def my_kernel(X, Y):
    norm = LA.norm(X) * LA.norm(Y)
    return np.dot(X, Y.T) / norm
and it shows this every time I run the program:
Traceback (most recent call last):
  File "F:\env\chatbot\chatbotProj\chatbotProj\train.py", line 84, in <module>
    pickle.dump(model.fit(trainx, trainy), f)
  File "F:\env\lib\site-packages\sklearn\svm\base.py", line 212, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "F:\env\lib\site-packages\sklearn\svm\base.py", line 252, in _dense_fit
    X = self._compute_kernel(X)
  File "F:\env\lib\site-packages\sklearn\svm\base.py", line 380, in _compute_kernel
    kernel = self.kernel(X, self.__Xfit)
  File "F:\env\chatbot\chatbotProj\chatbotProj\ChatbotCode\svm.py", line 31, in my_kernel
    norm = LA.norm(X) * LA.norm(Y)
  File "F:\env\lib\site-packages\numpy\linalg\linalg.py", line 2359, in norm
    sqnorm = dot(x, x)
  File "F:\env\lib\site-packages\scipy\sparse\base.py", line 478, in __mul__
    raise ValueError('dimension mismatch')
ValueError: dimension mismatch
I'm new to Python and to this SVM area; does anybody know what's wrong, or could you recommend a better, cleaner way to write a cosine similarity kernel?
Oh, and the dimensions from sklearn's train_test_split are: train X (193, 634), train Y (193,), test X (83, 634), and test Y (83,).
Update:
My friend told me it happened because I have a sparse matrix, not a plain array, so I have to densify it and replace the my_kernel.py code with this:

def my_kernel(X, Y):
    X = np.array(X.todense())
    Y = np.array(Y.todense())
    norm = LA.norm(X) * LA.norm(Y)
    return np.dot(X, Y.T) / norm
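For reference, a cleaner sketch of a true cosine-similarity kernel: the version above divides by the norms of the whole matrices, whereas cosine similarity normalizes each row. sklearn already ships this as sklearn.metrics.pairwise.cosine_similarity, which accepts scipy sparse input directly, so no todense() is needed:

from sklearn.metrics.pairwise import cosine_similarity

def my_kernel(X, Y):
    # row-wise cosine similarity; returns an (n_samples_X, n_samples_Y) array
    return cosine_similarity(X, Y)

SVC(kernel=my_kernel, C=1.5) then works unchanged with sparse tf-idf features.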