I've been working on a chatbot project that uses an SVM for its ML, and I really want to use cosine similarity as the kernel. I've tried pykernel (as suggested in this post) and code from another source, but it's still not working, and I don't know why.
Say I have a train.py like this:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle, csv, json, timeit, random, os, nltk
from nltk.stem.lancaster import LancasterStemmer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import LabelEncoder as LE
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from my_kernel import my_kernel  # import the function itself, not the module

def preprocessing(text):
    factory1 = StopWordRemoverFactory()
    StopWord = factory1.create_stop_word_remover()
    text = StopWord.remove(text)
    factory2 = StemmerFactory()
    stemmer = factory2.create_stemmer()
    return stemmer.stem(text)

le = LE()
tfv = TfidfVectorizer(min_df=1)
file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "scraping", "tes.json")
svm_pickle_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "svm_model.pickle")
if os.path.exists(svm_pickle_path):
    os.remove(svm_pickle_path)

tit = []   # Title
cat = []   # Category
post = []  # Post
with open(file, "r") as sentences_file:
    reader = json.load(sentences_file)
    for row in reader:
        tit.append(preprocessing(row["Judul"]))
        cat.append(preprocessing(row["Kategori"]))
        post.append(preprocessing(row["Post"]))

tfv.fit(tit)
le.fit(cat)
features = tfv.transform(tit)
labels = le.transform(cat)
trainx, testx, trainy, testy = tts(features, labels, test_size=.30, random_state=42)

model = SVC(kernel=my_kernel, C=1.5)
f = open(svm_pickle_path, 'wb')
pickle.dump(model.fit(trainx, trainy), f)
f.close()
print("SVC training score:", model.score(testx, testy))

with open(svm_pickle_path, 'rb') as file:
    pickle_model = pickle.load(file)

score = pickle_model.score(testx, testy)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(testx)
print(Ypredict)
and my_kernel.py looks like this:
import numpy as np
from numpy import linalg as LA

def my_kernel(X, Y):
    norm = LA.norm(X) * LA.norm(Y)
    return np.dot(X, Y.T) / norm
and it shows this every time I run the program:
Traceback (most recent call last):
File "F:\env\chatbot\chatbotProj\chatbotProj\train.py", line 84, in <module>
pickle.dump(model.fit(trainx, trainy), f)
File "F:\env\lib\site-packages\sklearn\svm\base.py", line 212, in fit
fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
File "F:\env\lib\site-packages\sklearn\svm\base.py", line 252, in _dense_fit
X = self._compute_kernel(X)
File "F:\env\lib\site-packages\sklearn\svm\base.py", line 380, in _compute_kernel
kernel = self.kernel(X, self.__Xfit)
File "F:\env\chatbot\chatbotProj\chatbotProj\ChatbotCode\svm.py", line 31, in my_kernel
norm = LA.norm(X) * LA.norm(Y)
File "F:\env\lib\site-packages\numpy\linalg\linalg.py", line 2359, in norm
sqnorm = dot(x, x)
File "F:\env\lib\site-packages\scipy\sparse\base.py", line 478, in __mul__
raise ValueError('dimension mismatch')
ValueError: dimension mismatch
I'm new to Python and to SVMs. Does anybody know what's wrong, or can anyone recommend a better, cleaner way to write a cosine similarity kernel?
Oh, and the dimensions from sklearn's train_test_split are: train X is (193, 634), train y is (193,), test X is (83, 634) and test y is (83,).
Update:
A friend told me this happens because I have a sparse matrix rather than a plain array, so I have to densify it, replacing the my_kernel.py code with this:
def my_kernel(X, Y):
    X = np.array(X.todense())
    Y = np.array(Y.todense())
    norm = LA.norm(X) * LA.norm(Y)
    return np.dot(X, Y.T) / norm
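For a cleaner version that skips the densify step entirely, sklearn's built-in cosine_similarity accepts sparse input directly and normalizes row by row (LA.norm over a whole 2-D matrix returns a single Frobenius norm, so the division above is not a true pair-wise cosine). A minimal sketch:
from sklearn.metrics.pairwise import cosine_similarity

def my_kernel(X, Y):
    # handles scipy sparse matrices directly; entry (i, j) is the cosine
    # of row i of X with row j of Y
    return cosine_similarity(X, Y)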
Hi, I'm currently learning to code with Python and have been following a tutorial series which helped me write the code shown below. Apologies for it being so long, but I cannot pinpoint the line that is causing this error. I have removed a lot of the comments to reduce the amount of code posted.
import numpy as np
import urllib.request
import os
import gzip
import lasagne
import theano
import theano.tensor as T

def load_dataset():
    def download(filename, source="http://yann.lecun.com/exdb/mnist/"):
        print("downloading:", filename)
        urllib.request.urlretrieve(source + filename, filename)

    def load_mnist_images(filename):
        if not os.path.exists(filename):
            download(filename)
        with gzip.open(filename, "rb") as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)
        data = data.reshape(-1, 1, 28, 28)
        return data / np.float32(256)

    def load_mnist_labels(filename):
        if not os.path.exists(filename):
            download(filename)
        with gzip.open(filename, "rb") as f:
            data = np.frombuffer(f.read(), np.uint8, offset=8)
        return data

    x_train = load_mnist_images("train-images-idx3-ubyte.gz")
    y_train = load_mnist_labels("train-labels-idx1-ubyte.gz")
    x_test = load_mnist_images("t10k-images-idx3-ubyte.gz")
    y_test = load_mnist_labels("t10k-labels-idx1-ubyte.gz")
    return x_train, y_train, x_test, y_test

x_train, y_train, x_test, y_test = load_dataset()

###### creating the handwriting digit recognition code ######

def build_nn(input_var=None):
    l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=input_var)
    l_in_drop = lasagne.layers.DropoutLayer(l_in, p=0.2)
    l_hid1 = lasagne.layers.DenseLayer(l_in_drop, num_units=800,
                                       nonlinearity=lasagne.nonlinearities.rectify,
                                       W=lasagne.init.GlorotUniform())
    l_hid1_drop = lasagne.layers.DropoutLayer(l_hid1, p=0.5)
    l_hid2 = lasagne.layers.DenseLayer(l_hid1_drop, num_units=800,
                                       nonlinearity=lasagne.nonlinearities.rectify,
                                       W=lasagne.init.GlorotUniform())
    l_hid2_drop = lasagne.layers.DropoutLayer(l_hid2, p=0.5)
    l_out = lasagne.layers.DenseLayer(l_hid2_drop, num_units=10,
                                      nonlinearity=lasagne.nonlinearities.softmax)
    return l_out

input_var = T.tensor4("inputs")    # an empty 4d array
target_var = T.ivector("targets")  # an empty 1d int array to represent the labels

network = build_nn(input_var)  # call the func that initializes the neural network
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)
train_fn = theano.function([input_var, target_var], loss, updates=updates)

num_training_steps = 10
for step in range(num_training_steps):
    train_err = train_fn(x_train, y_train)
    print("current training step is " + str(step))
The error that's stopping this code is this:
Traceback (most recent call last):
File "C:\Users\Admin\.vscode\Practice codes\machine learning\deep learning\deep learning.py", line 125, in <module>
network = build_nn(input_var) # call the func that initializes the neural network
File "C:\Users\Admin\.vscode\Practice codes\machine learning\deep learning\deep learning.py", line 95, in build_nn
l_hid1 = lasagne.layers.DenseLayer(l_in_drop, num_units= 800,
File "C:\Users\Admin\AppData\Roaming\Python\Python38\site-packages\lasagne\layers\dense.py", line 103, in __init__
self.W = self.add_param(W, (num_inputs, num_units), name="W")
File "C:\Users\Admin\AppData\Roaming\Python\Python38\site-packages\lasagne\layers\base.py", line 234, in add_param
param = utils.create_param(spec, shape, name)
File "C:\Users\Admin\AppData\Roaming\Python\Python38\site-packages\lasagne\utils.py", line 393, in create_param
spec = theano.shared(spec, broadcastable=bcast)
File "C:\Users\Admin\AppData\Roaming\Python\Python38\site-packages\theano\compile\sharedvalue.py", line 284, in shared
raise TypeError('No suitable SharedVariable constructor could be found.'
TypeError: No suitable SharedVariable constructor could be found. Are you sure all kwargs are supported? We do not support the parameter dtype or type. value="[[ 0.04638761 -0.02959769 0.02330909 ... 0.01545383 0.04763002
0.05265676]
[ 0.02095251 -0.05393376 -0.04289599 ... -0.02409102 0.02824548
-0.00327342]
[ 0.02908951 -0.02853872 -0.05450716 ... -0.02296509 0.02495853
0.02486875]
...
[-0.03704383 0.0286258 0.01158947 ... -0.02583007 -0.04925423
-0.0470493 ]
[ 0.03230407 -0.00246115 -0.05074456 ... 0.00299953 0.01883504
0.01312843]
[-0.05762409 -0.05119916 -0.02820581 ... -0.05675326 0.00458562
0.04403118]]". parameters="{'broadcastable': (False, False)}"
If it helps, I'm using Python 3.8 with lasagne 0.2.dev1 and theano 1.0.5.
Any help would be greatly appreciated; if you have any questions, feel free to ask.
Thanks in advance
I am trying to convert/store a sklearn SVC model as a .onnx file, and I am getting an ONNX runtime error I do not understand. I have been able to use this same code without error with a sklearn random forest classifier and a sklearn k-NN classifier. Any help with this error is appreciated.
Below I first posted the output of running svm_time.py, and below that the code contained in the svm_time.py file.
Thanks.
python3 svm_time.py
'train_model' 4809.58 ms
train score is: 0.8765468473777254
val Accuracy is: 0.7037037037037037
Traceback (most recent call last):
File "svm_time.py", line 97, in <module>
main()
File "svm_time.py", line 91, in main
onx = convert_sklearn(clf, initial_types=initial_type)
File "/home/matt/anaconda3/envs/venv/lib/python3.7/site-packages/skl2onnx/convert.py", line 154, in convert_sklearn
dtype=dtype, options=options)
File "/home/matt/anaconda3/envs/venv/lib/python3.7/site-packages/skl2onnx/common/_topology.py", line 1054, in convert_topology
conv(scope, operator, container)
File "/home/matt/anaconda3/envs/venv/lib/python3.7/site-packages/skl2onnx/common/_registration.py", line 29, in __call__
return self._fct(*args)
File "/home/matt/anaconda3/envs/venv/lib/python3.7/site-packages/skl2onnx/operator_converters/support_vector_machines.py", line 221, in convert_sklearn_svm_classifier
"Classes different from first n integers are not supported "
RuntimeError: Classes different from first n integers are not supported in SVC converter.
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split
import time
import math
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# https://medium.com/pythonhive/python-decorator-to-measure-the-execution-time-of-methods-fa04cb6bb36d
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r %2.2f ms' %
                  (method.__name__, (te - ts) * 1000))
        return result
    return timed

clf = SVC(kernel='rbf', gamma=0.001, C=10)

@timeit
def train_model(in_data, in_labels):
    clf.fit(in_data, in_labels)

def main():
    data = pd.read_csv('fall_data.csv', header=None)
    labels = pd.read_csv('fall_labels.csv', header=None)
    data = data.to_numpy()
    data_labels = labels[0]
    train_set, test_set, train_label, test_label = train_test_split(
        data, data_labels, test_size=0.1, random_state=42)
    train_set2, val_set, train_label2, val_label = train_test_split(
        train_set, train_label, test_size=0.1, random_state=42)
    scaler = StandardScaler().fit(train_set2)
    X_train = scaler.transform(train_set2)
    X_val = scaler.transform(val_set)
    train_model(X_train, train_label2)
    tpred = clf.predict(X_train)
    ts = accuracy_score(train_label2, tpred)
    print('train score is: ', ts)
    pred = clf.predict(X_val)
    s = accuracy_score(val_label, pred)
    print('val Accuracy is: ', s)
    initial_type = [('float_input', FloatTensorType([None, 453]))]
    onx = convert_sklearn(clf, initial_types=initial_type)
    with open("svmrbf_unimib_f8.onnx", "wb") as f:
        f.write(onx.SerializeToString())

if __name__ == '__main__':
    main()
It seems to me that this may be a compatibility issue between ONNX and sklearn.
1). https://github.com/onnx/sklearn-onnx/issues/302
2). https://github.com/onnx/sklearn-onnx/blob/master/skl2onnx/operator_converters/support_vector_machines.py#L17
Based on these two sources, I changed my code to use the OVO decision function shape instead of OVR, and now an .onnx file is saved when I run svm_time.py:
clf = SVC(kernel='rbf', gamma=0.001, C=10, decision_function_shape='ovo')
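To sanity-check the exported file, a quick onnxruntime round trip can be compared against clf.predict. This is only a sketch; it assumes onnxruntime is installed and that X_val is the scaled validation set from main:
import numpy as np
import onnxruntime as rt

sess = rt.InferenceSession("svmrbf_unimib_f8.onnx")
input_name = sess.get_inputs()[0].name
# the first output of the converted classifier is the predicted label
onnx_pred = sess.run(None, {input_name: X_val.astype(np.float32)})[0]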
I'm trying to train a simple movie recommendation system using the latest-small MovieLens dataset, but I keep getting an error saying:
Traceback (most recent call last):
File "D:\AI\Python projects\anotherone.py", line 48, in <module>
history = model.fit([train.userId,train.movieId], train.rating,epochs=10, verbose=1)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py", line 780, in fit
steps_name='steps_per_epoch')
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training_arrays.py", line 363, in model_iteration
batch_outs = f(ins_batch)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\backend.py", line 3292, in _call_
run_metadata=self.run_metadata)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1458, in _call_
run_metadata_ptr)
tensorflow.python.framework.errors_impl.InvalidArgumentError: indices[4,0] = 179819 is not in [0, 8984)
[[{{node Movie-Embedding/embedding_lookup}}]]
Code:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split

# ignoring warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# importing our data
df = pd.read_csv('D:/AI/Data sets/ml-latest-small/ratings.csv')
train, test = train_test_split(df, test_size=0.2, random_state=42)

# extracting the unique number of users and movies
n_users = len(df.userId.unique())
n_movies = len(df.movieId.unique())

# creating the embeddings
movie_input = Input(shape=[1], name="Movie-Input")
movie_embedding = Embedding(n_movies+1, 5, name="Movie-Embedding")(movie_input)
movie_vec = Flatten(name="Flatten-Movie")(movie_embedding)

user_input = Input(shape=[1], name="User-Input")
user_embedding = Embedding(n_users+1, 5, name="User-Embedding")(user_input)
user_vec = Flatten(name="Flatten-user")(user_embedding)

# concatenating the features
concat = concatenate([movie_vec, user_vec])

# creating our model
layer1 = Dense(128, activation="relu")(concat)
layer2 = Dense(32, activation="relu")(layer1)
outputLayer = Dense(1)(layer2)
model = Model([user_input, movie_input], outputLayer)
model.compile('adam', 'mean_squared_error')

# training the model
if os.path.exists('multiParam.h5'):
    model = load_model('multiParam.h5')
else:
    history = model.fit([train.userId, train.movieId], train.rating, epochs=10, verbose=1)
    model.save('multiParam.h5')
    plt.plot(history.history['loss'])
    plt.xlabel("Epochs")
    plt.ylabel("Training Error")

# testing the model
print(model.evaluate([test.userId, test.movieId], test.rating))

# running some predictions
predictions = model.predict([test.userId.head(10), test.movieId.head(10)])
[print(predictions[i], test.rating.iloc[i]) for i in range(0, 10)]
I'm still new to machine learning, but from my research I understood that I need to provide the number of unique values + 1 to the embedding layer, which I'm doing, yet it's still not working. Any help is appreciated, thank you :)
You should try label encoding both movieId and userId in order to make them sequential, starting from zero:
from sklearn.preprocessing import LabelEncoder
user_enc = LabelEncoder()
df['user'] = user_enc.fit_transform(df['userId'].values)
n_users = df['user'].nunique()
item_enc = LabelEncoder()
df['movie'] = item_enc.fit_transform(df['movieId'].values)
n_movies = df['movie'].nunique()
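The raw movieId values are sparse (the traceback shows index 179819 against an embedding of size 8984), so the embeddings have to be fed the encoded columns. Sketching the rest: rebuild the embedding layers with the new n_users/n_movies, then split and fit on the encoded columns:
# after rebuilding the embeddings with the encoded ranges:
train, test = train_test_split(df, test_size=0.2, random_state=42)
history = model.fit([train.user, train.movie], train.rating, epochs=10, verbose=1)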
I am trying to build a sentiment analyzer using scikit-learn/pandas. Building and evaluating the model works, but attempting to classify new sample text does not.
My code:
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

infile = 'Sentiment_Analysis_Dataset.csv'
data = "SentimentText"
labels = "Sentiment"

class Classifier():
    def __init__(self):
        self.train_set, self.test_set = self.load_data()
        self.counts, self.test_counts = self.vectorize()
        self.classifier = self.train_model()

    def load_data(self):
        df = pd.read_csv(infile, header=0, error_bad_lines=False)
        train_set, test_set = train_test_split(df, test_size=.3)
        return train_set, test_set

    def train_model(self):
        classifier = BernoulliNB()
        targets = self.train_set[labels]
        classifier.fit(self.counts, targets)
        return classifier

    def vectorize(self):
        vectorizer = TfidfVectorizer(min_df=5,
                                     max_df=0.8,
                                     sublinear_tf=True,
                                     ngram_range=(1, 2),
                                     use_idf=True)
        counts = vectorizer.fit_transform(self.train_set[data])
        test_counts = vectorizer.transform(self.test_set[data])
        return counts, test_counts

    def evaluate(self):
        test_counts, test_set = self.test_counts, self.test_set
        predictions = self.classifier.predict(test_counts)
        print(classification_report(test_set[labels], predictions))
        print("The accuracy score is {:.2%}".format(accuracy_score(test_set[labels], predictions)))

    def classify(self, input):
        input_text = input
        input_vectorizer = TfidfVectorizer(min_df=5,
                                           max_df=0.8,
                                           sublinear_tf=True,
                                           ngram_range=(1, 2),
                                           use_idf=True)
        input_counts = input_vectorizer.transform(input_text)
        predictions = self.classifier.predict(input_counts)
        print(predictions)

myModel = Classifier()
text = ['I like this I feel good about it', 'give me 5 dollars']
myModel.classify(text)
myModel.evaluate()
The error:
Traceback (most recent call last):
File "sentiment.py", line 74, in <module>
myModel.classify(text)
File "sentiment.py", line 66, in classify
input_counts = input_vectorizer.transform(input_text)
File "/home/rachel/Sentiment/ENV/lib/python3.5/site-packages/sklearn/feature_extraction/text.py", line 1380, in transform
X = super(TfidfVectorizer, self).transform(raw_documents)
File "/home/rachel/Sentiment/ENV/lib/python3.5/site-packages/sklearn/feature_extraction/text.py", line 890, in transform
self._check_vocabulary()
File "/home/rachel/Sentiment/ENV/lib/python3.5/site-packages/sklearn/feature_extraction/text.py", line 278, in _check_vocabulary
check_is_fitted(self, 'vocabulary_', msg=msg),
File "/home/rachel/Sentiment/ENV/lib/python3.5/site-packages/sklearn/utils/validation.py", line 690, in check_is_fitted
raise _NotFittedError(msg % {'name': type(estimator).__name__})
sklearn.exceptions.NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.
I'm not sure what the issue could be. In my classify method I create a brand-new vectorizer to process the text I want to classify, separate from the vectorizer used to create the training and test data for the model.
Thanks
You've fitted a vectorizer, but you throw it away because it doesn't exist past the lifetime of your vectorize function. Instead, save your vectorizer in vectorize after the data has been transformed:
self._vectorizer = vectorizer
Then in your classify function, don't create a new vectorizer. Instead, use the one you'd fitted to the training data:
input_counts = self._vectorizer.transform(input_text)
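Concretely, the two methods would end up looking something like this sketch:
# inside Classifier
def vectorize(self):
    vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True,
                                 ngram_range=(1, 2), use_idf=True)
    counts = vectorizer.fit_transform(self.train_set[data])
    test_counts = vectorizer.transform(self.test_set[data])
    self._vectorizer = vectorizer  # keep the fitted vectorizer around
    return counts, test_counts

def classify(self, input_text):
    # reuse the vocabulary learned from the training data
    input_counts = self._vectorizer.transform(input_text)
    print(self.classifier.predict(input_counts))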
Save the vectorizer as a pickle or joblib file and load it when you want to predict:
pickle.dump(vectorizer, open("vectorizer.pickle", "wb"))   # save vectorizer
vectorizer = pickle.load(open("vectorizer.pickle", "rb"))  # load vectorizer
You can save both the model and the vectorizer and use them later on as well; here is how I did it:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
import pickle

# Train the classification model
def train_model():
    df = pd.read_json('intent_data.json')
    X_train, X_test, y_train, y_test = train_test_split(df['Utterance'], df['Intent'], random_state=0)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    model = LinearSVC().fit(X_train_tfidf, y_train)

    # Save the vectorizer
    vec_file = 'vectorizer.pickle'
    pickle.dump(count_vect, open(vec_file, 'wb'))

    # Save the model
    mod_file = 'classification.model'
    pickle.dump(model, open(mod_file, 'wb'))

# Load the classification model from disk and use it for predictions
def classify_utterance(utt):
    # load the vectorizer
    loaded_vectorizer = pickle.load(open('vectorizer.pickle', 'rb'))

    # load the model
    loaded_model = pickle.load(open('classification.model', 'rb'))

    # make a prediction
    print(loaded_model.predict(loaded_vectorizer.transform([utt])))
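Usage then looks like this (the utterance string is just a made-up example):
train_model()                                   # fit once; writes both pickle files
classify_utterance('what is the weather like')  # predict in a later run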
I need some assistance; I am stuck on the concept of implementing pipelines using sklearn. The dataset is the KC Housing dataset from Kaggle. I am trying to build a simple linear regression using pipelines, but I am missing something quite basic, as I am unable to get past the error pasted at the bottom of this post. Please advise; it's really appreciated. The complete code is below; feel free to mend it where necessary.
ERROR:
Traceback (most recent call last):
File "/media/JBook/Software/PythonProjects/KCH/OneFi.py", line 123, in <module>
main()
File "/media/JBook/Software/PythonProjects/KCH/OneFi.py", line 118, in main
predictions_some_data = lin_reg.predict(some_data_prepared)
File "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/base.py", line 256, in predict
return self._decision_function(X)
File "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/base.py", line 241, in _decision_function
dense_output=True) + self.intercept_
File "/usr/local/lib/python3.5/dist-packages/sklearn/utils/extmath.py", line 135, in safe_sparse_dot
ret = a * b
File "/usr/local/lib/python3.5/dist-packages/scipy/sparse/base.py", line 387, in __mul__
raise ValueError('dimension mismatch')
ValueError: dimension mismatch
PS: The problem I am facing is almost at the end of this code:
"predictions_some_data = lin_reg.predict(some_data_prepared)"
import pandas as pd
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import data_visualize
from sklearn.model_selection import StratifiedShuffleSplit
import dataPrep
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

## Loading the data
KC_housing_path = "/media/JBook/Software/PythonProjects/KCH/datasets"

def load_housing_data(housing_path=KC_housing_path):
    '''if not os.path.isfile("datasets/kc_house_data.csv"):
        print("Check file location, program exiting..")
    else:
        csv_path = os.path.join(housing_path, "kc_house_data.csv")
        print("reading csv file ...")
        return pd.read_csv(csv_path)'''
    try:
        csv_path = os.path.join(housing_path, "kc_house_data.csv")
        print("reading csv file -->")
        return pd.read_csv(csv_path)
    except FileNotFoundError:
        print("Check file location, program exiting ...")
        sys.exit()

### Defining 2 classes for custom transformers
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attributeNames):
        self.attributes = attributeNames
        # print('\n In constructor', self.attributes)

    def fit(self, X, y=None):
        # print("__DF Fit Method:\n", (X[self.attributes].values).shape)
        return self

    def transform(self, X):
        # print("__Transform Method:\n", (X[self.attributes].values).shape)
        return X[self.attributes].values

class LabelBinarizerPipelineFriendly(LabelBinarizer):
    def fit(self, X, y=None):
        # print("LB-->X.shape", X.shape)
        """this would allow us to fit the model based on the X input."""
        super(LabelBinarizerPipelineFriendly, self).fit(X)

    def transform(self, X, y=None):
        # print("LB-Transform-X.shape", X.shape)
        return super(LabelBinarizerPipelineFriendly, self).transform(X)

    def fit_transform(self, X, y=None):
        # print("LB-FIT_TRANSFORM-X.Shape", X.shape)
        return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X)

def main():
    # Loading house data
    housing = load_housing_data()
    housing_labels = housing['price']

    # Removing unneeded features & the label (price)
    rem_attributes = ['id', 'date', 'price']
    housing_col_removed = housing.drop(rem_attributes, axis=1, inplace=False)

    ### Splitting the data
    train_set, test_set = train_test_split(housing_col_removed, test_size=0.3, random_state=42)

    #### Pipelines for numeric & categorical attribute transformations:
    #### adding the median for missing values & one-hot encoding categorical attributes
    data_numeric = housing_col_removed.drop('ocean_proximity', axis=1, inplace=False)
    numeric_attrib = list(data_numeric)
    cat_attrib = ['ocean_proximity']

    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(numeric_attrib)),
        ('imputing', Imputer(missing_values=0, strategy='median')),
    ])
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attrib)),
        ('label_Bin', LabelBinarizerPipelineFriendly(sparse_output=True)),
    ])
    full_pipeline = FeatureUnion(transformer_list=[
        ('num_pipeline', num_pipeline),
        ('cat_pipeline', cat_pipeline),
    ])

    #### Fitting the linear regression model
    # print('This housing data is passed to prepare\n', housing_col_removed.head())
    housing_prepared = dataPrep.prepData(housing_col_removed)
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)
    print('Housing Prepared Shape: \n', housing_prepared.shape)
    print('\t\t\t\t************* Predictions from Linear Regression Are ***********\n', lin_reg.predict(housing_prepared))

    ### Below section tries to use some data (5 rows) from the whole data set to predict values
    some_data = housing_col_removed[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.fit_transform(some_data)
    print('\t\t\tSome Data Prepared is\n', some_data_prepared)
    predictions_some_data = lin_reg.predict(some_data_prepared)
    print('\t\t\t\t************* Predictions from Linear Regression Are ***********\n', predictions_some_data)
    # print('\t\t\t\t************* Labels Are ***********\n', list(some_labels))

main()
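For what it's worth, the dimension mismatch is consistent with some_data_prepared having a different column count than housing_prepared: calling fit_transform on the 5-row sample refits the pipeline, so the binarizer only learns the categories present in those 5 rows. The usual pattern, sketched below under the assumption that dataPrep.prepData applies the same transformations as full_pipeline, is to fit the pipeline once on the full data and only transform new rows:
# fit the pipeline once on the full feature table ...
housing_prepared = full_pipeline.fit_transform(housing_col_removed)
lin_reg = LinearRegression().fit(housing_prepared, housing_labels)

# ... then only transform the sample, so the one-hot columns line up
some_data_prepared = full_pipeline.transform(some_data)
predictions_some_data = lin_reg.predict(some_data_prepared)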