How can I save a LibSVM python object instance? - python

I want to use this classifier on another computer without having to train it again.
I used to save some scikit-learn classifiers with cPickle.
Doing the same with LIBSVM gives me "ValueError: ctypes objects containing pointers cannot be pickled".
I'm using LibSVM 3.1 and Python 2.7.3.
Thanks
# Train a small linear SVM with LIBSVM, then try to persist the model with
# cPickle and reload it (Python 2 script).
from libsvm.svm import *
from libsvm.svmutil import *
import cPickle
# Two training samples with three features each, labelled 1 and -1.
x = [[1, 0, 1], [-1, 0, -1]]
y = [1, -1]
prob = svm_problem(y, x)
param = svm_parameter()
param.kernel_type = LINEAR
param.C = 10
m = svm_train(prob, param)
# Predict two test samples with the freshly trained model.
labels_pred, acc, probs = svm_predict([-1, 1], [[1, 1, 1], [0, 0, 1]], m)
print labels_pred, acc, probs
# Debugger breakpoint left in the script.
import ipdb; ipdb.set_trace()
filename='libsvm-classif.pkl'
fid = open(filename, 'wb')
# This dump is the step reported to fail with
# "ValueError: ctypes objects containing pointers cannot be pickled".
cPickle.dump(m, fid)
fid.close()
# Reload the model and repeat the prediction to check the round trip.
fid = open(filename, 'rb')
m = cPickle.load(fid)
labels_pred, acc, probs = svm_predict([-1, 1], [[1, 1, 1], [0, 0, 1]], m)
print labels_pred, acc, probs

Just use libsvm's own save and load functions:
# Save the trained model `m` to the file 'libsvm.model'.
svm_save_model('libsvm.model', m)
# Load the model back from that file; `m` can then be passed to svm_predict.
m = svm_load_model('libsvm.model')
This is from the README file included in the python directory of the libsvm package. It seems to have a much better description of features than the website.

Related

Training New AutoTokenizer Hugging Face

Getting this error: AttributeError: 'GPT2Tokenizer' object has no
attribute 'train_new_from_iterator'
This is very similar to the Hugging Face documentation; I only changed the input, which shouldn't affect it. It worked once. I came back to it two hours later and it no longer works, even though nothing was changed. The documentation states that train_new_from_iterator only works with 'fast' tokenizers and that AutoTokenizer is supposed to pick a 'fast' tokenizer by default. My best guess is that it is having some trouble with this. I also tried downgrading and reinstalling transformers, without success. df is just one column of text.
from transformers import AutoTokenizer
import tokenizers
def batch_iterator(batch_size=10, size=5000):
    """Yield batches of note texts read chunk by chunk from the database.

    For each of 100 chunks, one query fetches up to 50 `note_text` rows
    with `id > i * size`; the resulting column is then sliced into
    consecutive lists of at most `batch_size` texts.

    NOTE(review): `pd` and `cmx_uat` are not defined in this snippet and
    are presumably provided elsewhere (pandas and a DB connection).
    """
    for i in range(100): #2264
        query = f"select note_text from cmx_uat.note where id > {i * size} limit 50;"
        df = pd.read_sql(sql=query, con=cmx_uat)
        # The slice offsets run up to `size`, while the query returns at
        # most 50 rows, so offsets past the fetched rows give empty lists.
        for x in range(0, size, batch_size):
            yield list(df['note_text'].loc[0:5000])[x:x + batch_size]
# Load the starting tokenizer from the checkpoint name 'roberta'.
old_tokenizer = AutoTokenizer.from_pretrained('roberta')
training_corpus = batch_iterator()
# Call reported to raise the AttributeError; 32000 is passed as the second
# argument (presumably the vocabulary size; confirm against the API docs).
new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 32000)
There are two things to keep in mind:
First: train_new_from_iterator works with fast tokenizers only.
(here you can read more)
Second: the training corpus should be
a generator of batches of texts, for instance, a list of lists of
texts if you have everything in memory. (official documents)
def batch_iterator(batch_size=3, size=8):
    """Yield a toy two-string batch once per step of `batch_size` up to `size`.

    A small in-memory DataFrame stands in for the real corpus. Every
    yielded batch is a fresh list holding the whole `note_text` column;
    `batch_size` and `size` only control how many batches are produced.
    """
    frame = pd.DataFrame({"note_text": ['fghijk', 'wxyz']})
    steps = range(0, size, batch_size)
    for _ in steps:
        yield frame['note_text'].tolist()
# Load the starting tokenizer from the checkpoint name 'roberta-base'.
old_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
training_corpus = batch_iterator()
new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 32000)
# Encode the same two strings with both tokenizers to compare the results.
print(old_tokenizer( ['fghijk', 'wxyz']))
print(new_tokenizer( ['fghijk', 'wxyz']))
output:
{'input_ids': [[0, 506, 4147, 18474, 2], [0, 605, 32027, 329, 2]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}
{'input_ids': [[0, 22, 2], [0, 21, 2]], 'attention_mask': [[1, 1, 1], [1, 1, 1]]}

Using LinearOperator as nonlinear constraints in Scipy optimize

I'm trying to use the optimization module in SciPy to solve a constrained optimization problem. I need to implement the 'hess' argument. In SciPy's documentation and tutorial, the Hessians are simply [[2, 0], [0, 0]] and [[2, 0], [0, 0]]. However, my Hessians are something like [[(-24)*x[0]**2 + 48*x[0]-16, 0], [0, 0]] and [[(-48)*x[0]**2 + 192*x[0]-176, 0], [0, 0]], so I cannot simply use numpy.array to do the multiplication. It seems that I should pass a LinearOperator object to the 'hess' argument. Examples of using LinearOperator are unclear in both the scipy.optimize tutorial and the LinearOperator documentation, since they only show low-dimensional examples. How do I use it correctly?
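The problem formulation is
$$\min_{x_0,\,x_1} \; -x_0 - x_1$$
subject to
$$-2x_0^4 + 8x_0^3 - 8x_0^2 + x_1 - 2 \le 0,$$
$$-4x_0^4 + 32x_0^3 - 88x_0^2 + 96x_0 + x_1 - 36 \le 0,$$
$$0 \le x_0 \le 3, \qquad 0 \le x_1 \le 4.$$
My code is: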
import numpy as np
from scipy.optimize import Bounds
from scipy.optimize import NonlinearConstraint
from scipy.optimize import minimize
def f(x):
    """Objective function: the negated sum of the first two entries of `x`."""
    first, second = x[0], x[1]
    return -first - second
def grad(x):
    """Gradient of the objective: the constant integer vector [-1, -1].

    `x` is accepted for interface compatibility but not used.
    """
    return np.full(2, -1)
def hess(x):
    """Hessian of the objective: the 2x2 integer zero matrix.

    `x` is accepted for interface compatibility but not used.
    """
    return np.zeros((2, 2), dtype=int)
def cons_f(x):
    """Return the two constraint values at `x` as a plain Python list.

    Both constraints are quartic polynomials in x[0] plus x[1] and a
    constant term.
    """
    x0, x1 = x[0], x[1]
    first = (-2)*x0**4 + 8*x0**3 + (-8)*x0**2 + x1 - 2
    second = (-4)*x0**4 + 32*x0**3 + (-88)*x0**2 + 96*x0 + x1 - 36
    return [first, second]
def cons_Jacobian(x):
    """Return the 2x2 Jacobian of the constraints as nested Python lists.

    Row i is [d(constraint_i)/dx[0], d(constraint_i)/dx[1]]; the second
    column is the constant 1 for both constraints.
    """
    x0 = x[0]
    d_first = (-8)*x0**3 + 24*x0**2 - 16*x0
    d_second = (-16)*x0**3 + 96*x0**2 - 176*x0 + 96
    return [[d_first, 1], [d_second, 1]]
def cons_Hessian(x,v):
    # Meant to combine the two constraint Hessians weighted by v[0] and v[1].
    # TODO
    # NOTE(review): both operands are nested Python lists, not NumPy arrays,
    # so `v[i] * [[...]]` is not an elementwise scaling and `+` would
    # concatenate lists rather than add matrices.
    return v[0]*[[(-24)*x[0]**2 + 48*x[0]-16, 0], [0, 0]] + v[1]*[[(-48)*x[0]**2 + 192*x[0]-176, 0], [0, 0]]
# Both constraint values are bounded to the interval [-inf, 0].
nonlinear_constraint = NonlinearConstraint(cons_f, -np.inf, 0, jac=cons_Jacobian, hess=cons_Hessian)
# Box bounds: 0 <= x[0] <= 3.0 and 0 <= x[1] <= 4.0.
bounds = Bounds([0, 0], [3.0, 4.0])
# Starting point for the solver.
x0 = np.array([0.5, 1])
# Minimize f with the 'trust-constr' method, passing the gradient and Hessian.
res = minimize(f, x0, method='trust-constr', jac=grad, hess=hess,
constraints=[nonlinear_constraint],bounds=bounds)
The cons_Hessian(x, v) is absolutely wrong in my code.
In their example, although the Hessians are simply [[2, 0], [0, 0]] and [[2, 0], [0, 0]], the usage is confusing. I don't understand where p comes in.
from scipy.sparse.linalg import LinearOperator
def cons_H_linear_operator(x, v):
    """Wrap the weighted constraint Hessian as a 2x2 LinearOperator.

    The operator never builds the matrix; it only defines the product
    with a vector `p`. Only the first component of the product can be
    non-zero: p[0] * 2 * (v[0] + v[1]). `x` is not used.
    """
    def apply_to(p):
        multiplier_sum = v[0] + v[1]
        leading = p[0] * 2 * multiplier_sum
        return np.array([leading, 0])

    return LinearOperator((2, 2), matvec=apply_to)
# Constraint values are bounded to [-inf, 1]; the Hessian is supplied as the
# LinearOperator factory above. `cons_J` is not defined in this snippet.
nonlinear_constraint = NonlinearConstraint(cons_f, -np.inf, 1,
jac=cons_J, hess=cons_H_linear_operator)
There's no need to use a LinearOperator. You only need to ensure that cons_f, cons_Jacobian and cons_Hessian return np.ndarrays. That's the reason why you can't evaluate your cons_Hessian. Additionally, it's highly recommended to use float literals instead of integers, i.e. -2.0 instead of -2, so that the functions don't return np.ndarrays with an integer dtype.
Your example works for me by writing these functions as follows:
def cons_f(x):
    """Return the two constraint values at `x` as a NumPy array of length 2."""
    x0, x1 = x[0], x[1]
    values = (
        (-2.0)*x0**4 + 8*x0**3 + (-8)*x0**2 + x1 - 2,
        (-4)*x0**4 + 32*x0**3 + (-88)*x0**2 + 96*x0 + x1 - 36,
    )
    return np.array(values)
def cons_Jacobian(x):
    """Return the 2x2 constraint Jacobian at `x` as a NumPy array.

    Row i holds the gradient of constraint i; the derivative with respect
    to x[1] is the constant 1 for both constraints.
    """
    x0 = x[0]
    rows = [
        [(-8.0)*x0**3 + 24*x0**2 - 16*x0, 1],
        [(-16)*x0**3 + 96*x0**2 - 176*x0 + 96, 1],
    ]
    return np.array(rows)
def cons_Hessian(x,v):
    """Return v[0]*H1(x) + v[1]*H2(x) as a 2x2 NumPy array.

    H1 and H2 are the Hessians of the two constraints. Each has a single
    non-zero entry at [0, 0], because the constraints are linear in x[1].
    """
    x0 = x[0]
    curvatures = (
        (-24.0)*x0**2 + 48*x0 - 16,
        (-48)*x0**2 + 192*x0 - 176,
    )
    first, second = (np.array([[c, 0], [0, 0]]) for c in curvatures)
    return v[0]*first + v[1]*second

How to Match Cluster Labels with True Labels in K-Means using Python

I have a problem with label data using the K-Means algorithm. My test sentence is assigned to the right cluster, but I don't get the true label. I am already using NumPy to match the cluster with true_test_labels, but K-Means can assign the cluster numbers in any order, so the true labels don't match the cluster numbers. I need help with this problem. Here's my code:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import numpy as np
from collections import Counter
# NLTK's Indonesian stop-word list, stored as a set for membership tests.
stop = set(stopwords.words('indonesian'))
# Every character of string.punctuation, stored as a set.
exclude = set(string.punctuation)
# Lemmatizer instance reused by clean().
lemma = WordNetLemmatizer()
def clean(doc):
    """Return the cleaned tokens of `doc` as a list of strings.

    Steps, in order: lowercase and drop stop words, strip punctuation
    characters, lemmatize each remaining word, remove digit runs, and
    split the result on whitespace.
    """
    kept_words = [word for word in doc.lower().split() if word not in stop]
    without_stops = " ".join(kept_words)
    without_punct = "".join(filter(lambda ch: ch not in exclude, without_stops))
    lemmas = [lemma.lemmatize(token) for token in without_punct.split()]
    digits_removed = re.sub(r"\d+", "", " ".join(lemmas))
    return digits_removed.split()
path = "coba.txt"
# Clean every line of the training file. The context manager closes the
# file even if cleaning raises; the original never closed the handle.
with open(path, 'r') as fp:
    train_clean_sentences = [' '.join(clean(line.strip())) for line in fp]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_clean_sentences)
# Clustering the training 30 sentences with K-means technique
modelkmeans = KMeans(n_clusters=3, init='k-means++', max_iter=200, n_init=100)
modelkmeans.fit(X)
teks_satu = "Aplikasi Machine Learning untuk mengenali daun mangga dengan metode CNN"
# Clean the single test sentence the same way as the training sentences.
test_clean_sentence = []
cleaned_test = clean(teks_satu)
cleaned = ' '.join(cleaned_test)
cleaned = re.sub(r"\d+","",cleaned)
test_clean_sentence.append(cleaned)
# Reuse the fitted vectorizer so the test vector shares the training vocabulary.
Test = vectorizer.transform(test_clean_sentence)
true_test_labels = ['AI','VR','Sistem Informasi']
predicted_labels_kmeans = modelkmeans.predict(Test)
print(predicted_labels_kmeans)
print ("\n-------------------------------PREDICTIONS BY K-Means--------------------------------------")
# For each slice of five training sentences, print the cluster id that
# occurs most often among them.
print ("\nIndex of Virtual Reality : ",Counter(modelkmeans.labels_[5:10]).most_common(1)[0][0])
print ("Index of Machine Learning : ",Counter(modelkmeans.labels_[0:5]).most_common(1)[0][0])
print ("Index of Sistem Informasi : ",Counter(modelkmeans.labels_[10:15]).most_common(1)[0][0])
# `np.int` was removed from NumPy; take the single prediction explicitly and
# convert it with the built-in int.
# NOTE(review): the raw cluster id is still used directly as the index into
# true_test_labels, so the printed label is only right when the cluster
# numbering happens to follow the list order.
print ("\n",teks_satu,":",true_test_labels[int(predicted_labels_kmeans[0])],":",predicted_labels_kmeans)
I had the same problem: my clustering (k-means) returned different classes (cluster numbers) than the true classes. The result was that the true labels and the predicted labels didn't match. The solution that worked for me was this code (scroll to 'Permutation maximizing the sum of the diagonal elements'). Although this method works well, I think there can be situations where it is wrong.
Here is a concrete example showing how to match KMeans cluster ids with training data labels. The underlying idea is that the confusion_matrix should have large values on its diagonal, assuming that classification is done correctly. Here is the confusion matrix before associating cluster center ids with training labels:
cm =
array([[ 0, 395, 0, 5, 0],
[ 0, 2, 5, 391, 2],
[ 2, 0, 0, 0, 398],
[ 0, 0, 400, 0, 0],
[398, 0, 0, 0, 2]])
Now we just need to reorder the confusion matrix so that its large values move onto the diagonal. This can be achieved easily with
# For each column of cm, take the index of the row holding its largest count.
cm_argmax = cm.argmax(axis=0)
cm_argmax
# Replace every entry i of y_pred by cm_argmax[i].
y_pred_ = np.array([cm_argmax[i] for i in y_pred])
Here we get the new confusion matrix, which looks much more familiar now, right?
cm_ =
array([[395, 5, 0, 0, 0],
[ 2, 391, 2, 5, 0],
[ 0, 0, 398, 0, 2],
[ 0, 0, 0, 400, 0],
[ 0, 0, 2, 0, 398]])
You can further verify the result with accuracy_score
# Replace every entry i of y_pred by cm_argmax[i], then score against y.
y_pred_ = np.array([cm_argmax[i] for i in y_pred])
accuracy_score(y,y_pred_)
# 0.991 (value reported for this example)
The entire standalone code is here:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import confusion_matrix,accuracy_score
# Five 2-D blob centers, one row per blob.
blob_centers = np.array(
[[ 0.2, 2.3],
[-1.5 , 2.3],
[-2.8, 1.8],
[-2.8, 2.8],
[-2.8, 1.3]])
# One standard deviation per blob, in the same order as the centers.
blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1])
# Generate 2000 samples around those centers; the seed is fixed at 7.
X, y = make_blobs(n_samples=2000, centers=blob_centers,
cluster_std=blob_std, random_state=7)
def plot_clusters(X, y=None):
    """Scatter-plot the first two columns of `X` on the current figure.

    `y`, when given, is passed to the scatter call as the per-point
    colour argument. The axes are labelled x_1 and x_2.
    """
    first_feature = X[:, 0]
    second_feature = X[:, 1]
    plt.scatter(first_feature, second_feature, c=y, s=1)
    plt.xlabel("$x_1$", fontsize=14)
    plt.ylabel("$x_2$", fontsize=14, rotation=0)
# Show the raw, unlabelled data first.
plt.figure(figsize=(8, 4))
plot_clusters(X)
plt.show()
k = 5
kmeans = KMeans(n_clusters=k, random_state=42)
# Cluster ids come out in an arbitrary order relative to the labels in y.
y_pred = kmeans.fit_predict(X)
# Confusion matrix of the true labels y against the raw cluster ids.
cm = confusion_matrix(y, y_pred)
cm
# For each column of cm (a cluster id), the row index with the largest
# count, i.e. the label that dominates that cluster.
cm_argmax = cm.argmax(axis=0)
cm_argmax
# Translate every cluster id into its matched label.
y_pred_ = cm_argmax[y_pred]
# Build the reordered matrix from the remapped predictions. The original
# passed y_pred here, which simply reproduced cm.
cm_ = confusion_matrix(y, y_pred_)
cm_
accuracy_score(y,y_pred_)
You can assign to each cluster the label held by the majority of the true labels in that cluster.

Spyder imports a module but I am not able to use it

I am getting a very strange error and I can't really understand why.
I can successfully import a module named "filterpy", but when I run the code I get the error: module 'filterpy' has no attribute 'kalman'
Fun fact: the Spyder editor tells me that such a module exists; in fact, it even offers auto-completion for it.
Am I missing something?
Thanks,
Gabriele
The code is below, the library can be installed with pip
import numpy as np
import filterpy as fp
def fx(x, dt):
    """State transition: predict the next state under constant velocity.

    Builds the 4x4 float matrix that is the identity plus `dt` at
    positions [0, 1] and [2, 3], and returns its product with `x`
    (x = vt + x_0 for each position/velocity pair).
    """
    transition = np.eye(4, dtype=float)
    transition[0, 1] = dt
    transition[2, 3] = dt
    return np.dot(transition, x)
def hx(x):
    """Measurement function: return [x_pos, y_pos] from the state.

    The measurement consists of entries 0 and 2 of `x`.
    """
    x_pos, y_pos = x[0], x[2]
    return np.array([x_pos, y_pos])
dt = 0.1
# NOTE(review): `fp` is bound by a bare `import filterpy as fp`; the reported
# "module 'filterpy' has no attribute 'kalman'" suggests the submodules are
# never imported. Presumably `import filterpy.kalman` and
# `import filterpy.common` are needed; verify.
# create sigma points to use in the filter. This is standard for Gaussian processes
points = fp.kalman.MerweScaledSigmaPoints(4, alpha=.1, beta=2., kappa=-1)
kf = fp.kalman.UnscentedKalmanFilter(dim_x=4, dim_z=2, dt=dt, fx=fx, hx=hx, points=points)
kf.x = np.array([-1., 1., -1., 1]) # initial state
kf.P *= 0.2 # initial uncertainty
z_std = 0.1
kf.R = np.diag([z_std**2, z_std**2]) # diagonal matrix with z_std**2 for both measurements
kf.Q = fp.common.Q_discrete_white_noise(dim=2, dt=dt, var=0.01**2, block_size=2)
# 50 two-component measurements: the index i plus Gaussian noise scaled by z_std.
zs = [[i+np.random.randn()*z_std, i+np.random.randn()*z_std] for i in range(50)] # measurements
# Run one predict/update cycle per measurement and print the state estimate.
for z in zs:
    kf.predict()
    kf.update(z)
    print(kf.x, 'log-likelihood', kf.log_likelihood)

Spyder compiler displays wrong output

So I am trying to run the following code:
import numpy as np
import numpy.linalg as la
# Design matrix with four samples (rows) and three columns.
x = np.array ( [ [1, 0, 1], [0, 1, 1], [0, 0, 1], [1, 1, 1]] )
# One target value per sample.
y = np.array ( [1, 1, 0, 0] )
# solve using normal equations:
# theta = inv(x^T x) (x^T y), computed step by step below.
x_transpose = np.transpose(x) #calculating transpose
x_transpose_dot_x = x_transpose.dot(x) # calculating dot product
temp_1 = la.inv(x_transpose_dot_x) #calculating inverse
temp_2 = x_transpose.dot(y)
theta = temp_1.dot(temp_2)
print(theta)
The output using spyder IDE:
[2.22044605e-16 1.11022302e-16 5.00000000e-01]
The output using Colab or the py IDE:
[0.00000000e+00 2.22044605e-16 5.00000000e-01]
Why is Spyder producing a wrong output? I like using Spyder, though!

Categories

Resources