We were given some code for a support vector machine and are supposed to implement leave-one-out cross validation. If I understand it correctly, leave-one-out creates as many train/test splits as there are samples, which means that for a big data set the process will be costly and will most likely take a long time to produce results.
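To check that understanding, here is a minimal sketch on dummy data (not the real expression matrix) showing that sklearn's LeaveOneOut produces exactly one split per sample:
import numpy as np
from sklearn.model_selection import LeaveOneOut

X_demo = np.random.rand(773, 10)        # dummy data with 773 samples
y_demo = np.random.randint(0, 2, 773)
loo = LeaveOneOut()
print(loo.get_n_splits(X_demo))         # 773 -> 773 separate model fits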
I have tried to add leave-one-out to the given SVM code with only one iteration and 773 data points in total. I expected it to take some time, but two hours later the code is still running without any result, which makes me believe it might be stuck in some loop.
Does anyone have a suggestion as to what might be wrong? I'm not getting any error message either.
The entire code is as follows, with the leave-one-out part in the last function at the bottom (executed in a Jupyter notebook on an online Binder):
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gseapy as gp
from gseapy.plot import gseaplot
import qvalue
from ipywidgets import interact, interact_manual
from ipywidgets import IntSlider, FloatSlider, Dropdown, Text
import sklearn as skl
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import LeaveOneOut
from sklearn import svm
interact_enrich=interact_manual.options(manual_name="Enrichment analysis")
interact_plot=interact_manual.options(manual_name="Plot")
interact_calc=interact_manual.options(manual_name="Calculate tests")
interact_gen=interact_manual.options(manual_name="Initialize data")
interact_SVM=interact_manual.options(manual_name="Train SVM")
clinical_data = pd.read_csv('../data/brca_clin.tsv.gz', sep ='\t', index_col=2)
clinical_data = clinical_data.iloc[4:,1:]
expression_data = pd.read_csv('../data/brca.tsv.gz', sep ='\t', index_col=1)
expression_data = expression_data.iloc[:,2:].T
def split_data(clinical_df, expression_df, separator, cond1, cond2):
    try:
        group1 = clinical_df[separator] == cond1
        index1 = clinical_df[group1].index
        group2 = clinical_df[separator] == cond2
        index2 = clinical_df[group2].index
    except:
        print('Clinical condition wrong')
    expression1 = expression_df.loc[index1].dropna()
    expression2 = expression_df.loc[index2].dropna()
    expression = pd.concat([expression1, expression2])
    X = expression.values
    y = np.append(np.repeat(0, len(expression1)), np.repeat(1, len(expression2)))
    display(pd.DataFrame([len(index1), len(index2)], columns=['Number of points'], index=['Group 1', 'Group 2']))
    return X, y

def plot_pca_variance(X, scale=False, ncomp=1):
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    pca = PCA()
    pca.fit(X)
    plt.rcParams["figure.figsize"] = (20, 10)
    sns.set(style='darkgrid', context='talk')
    plt.plot(np.arange(1, len(pca.explained_variance_ratio_) + 1), np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance')
    plt.vlines(ncomp, 0, plt.gca().get_ylim()[1], color='r', linestyles='dashed')
    h = np.cumsum(pca.explained_variance_ratio_)[ncomp - 1]
    plt.hlines(h, 0, plt.gca().get_xlim()[1], color='r', linestyles='dashed')
    plt.title(str(ncomp) + ' components, ' + str(round(h, 3)) + ' variance explained')
    plt.show()

def reduce_data(X, n, scale=True):
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    pca = PCA(n_components=n)
    Xr = pca.fit_transform(X)
    return Xr

def interact_split_data(Criteria, Group_1, Group_2):
    global BRCA_X, BRCA_y
    BRCA_X, BRCA_y = split_data(clinical_data, expression_data, Criteria, Group_1, Group_2)

def interact_SVM_1(Rescale, Max_iterations):
    max_iter = int(Max_iterations)
    loo = LeaveOneOut()
    ac_matrix_train, ac_matrix_test = np.array([]), np.array([])
    for train_id, test_id in loo.split(BRCA_X, BRCA_y):
        X_train, X_test, y_train, y_test = BRCA_X[train_id, :], BRCA_X[test_id, :], BRCA_y[train_id], BRCA_y[test_id]
        clf = svm.LinearSVC(C=0.1, max_iter=100000).fit(X_train, y_train)  # Train an SVM
        y_train_pred = clf.predict(X_train)
        ac_matrix_train = confusion_matrix(y_train, y_train_pred)
        y_test_pred = clf.predict(X_test)
        ac_matrix_test = confusion_matrix(y_test, y_test_pred)
    display(pd.DataFrame(np.concatenate((ac_matrix_train, ac_matrix_test), axis=1), columns=["predicted G1 (training)", "predicted G2 (training)", "predicted G1 (test)", "predicted G2 (test)"], index=["actual G1", "actual G2"]))
interact_gen(interact_split_data, Criteria=Text('PR status by ihc'), Group_1 = Text('Positive'), Group_2=Text('Negative'))
interact_SVM(interact_SVM_1, Rescale=False, Max_iterations=Text('1'))
Related
I am performing least-squares classification on my data. I was able to obtain my weights and decided to plot a decision boundary line. However, I need a confusion matrix to show my classification results. I was going to use from sklearn.metrics import confusion_matrix and assign t as my prediction, but I am not sure how to obtain my actual results to work out the matrix. I have never plotted one, so I might be getting all this wrong.
import numpy as np
import matplotlib.pyplot as plt
data=np.loadtxt("MyData_A.txt")
x=data[:,0:2] #the data points
t=data[:,2] #class which data points belong to either 1s or 0s
x0=np.ones((len(x),1)) # create a column of ones (n x 1), where n is the number of points
X=np.append(x, x0, axis=1) # add column x0 to data
# w= ( (((X^T)X)^-1 )X^T )t
XT_X=np.dot(X.T, X) # (X^T)X
inv_XT_X=np.linalg.inv(XT_X) # ((X^T)X)^-1
X_tot=np.dot(inv_XT_X, X.T) # ((X^T)X)^-1 )X^T
w=np.dot(X_tot, t) # ( (((X^T)X)^-1 )X^T )t
x1_line = np.array([-1, 2])
x2_line = -w[2] / w[1] - (w[0] / w[1]) * x1_line
color_cond=['r' if t==1 else 'b' for t in t]
plt.scatter(x[:,0],x[:,1],color=color_cond)
plt.plot(x1_line,x2_line,color='k')
plt.xlabel('X1')
plt.ylabel('X2')
plt.ylim(-2,2)
plt.title('Training Data (X1,X2)')
plt.show()
The resulting plot shows the training data (X1, X2) colored by class, together with the decision boundary line.
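One way to get the predictions is to classify each point by which side of the fitted boundary line it falls on, and then pass those predictions together with the true labels t to confusion_matrix: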
from sklearn.metrics import confusion_matrix
import seaborn as sns
def predict(x1_line, x2_line, x):
    # Sign of the cross product tells which side of the boundary line the point x lies on
    d = (x[0] - x1_line[0]) * (x2_line[1] - x2_line[0]) - (x[1] - x2_line[0]) * (x1_line[1] - x1_line[0])
    pred = 0 if d > 0 else 1
    return pred
preds = np.array([predict(x1_line, x2_line, x12) for x12 in x])
conf_mat = confusion_matrix(t, preds)
sns.heatmap(conf_mat, annot=True);
plt.show()
LogisticRegression, confusion_matrix and ConfusionMatrixDisplay get the job done:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
data = np.loadtxt("MyData_A.txt")
X = data[:, :-1]
y = data[:, -1].astype(int)
clf = LogisticRegression().fit(X, y)
pred = clf.predict(X)
cm = confusion_matrix(y, pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
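If you run this as a plain script rather than in a notebook, you may also need to show the figure explicitly:
import matplotlib.pyplot as plt
plt.show()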
I want to make my trees simpler. Is there a way to plot trees without showing samples (e.g. 83) and values (e.g. [34, 53, 29, 26])? (I don't want those last two lines in each node.)
Here is part of my current tree-plotting code.
X = df.iloc[:,0: -1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
clf = RandomForestClassifier()
clf.fit(X_train,y_train)
.
.
.
.
# Here, I guess I need to add some commands.
plot_tree(clf.estimators_[5],
feature_names=X.columns,
class_names=names,
filled=True,
impurity=True,
rounded=True,
max_depth = 3)
Let's say we have a dataset like this, and we assign the matplotlib axis using the ax= argument:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import matplotlib.pyplot as plt
import re
import matplotlib
fig, ax = plt.subplots(figsize=(8,5))
clf = RandomForestClassifier(random_state=0)
iris = load_iris()
clf = clf.fit(iris.data, iris.target)
tree.plot_tree(clf.estimators_[0],ax=ax,
feature_names= iris.feature_names, class_names=iris.target_names)
I'm not sure if it is the best way, but one way is to go into ax.properties() and edit the annotation text:
def replace_text(obj):
    if type(obj) == matplotlib.text.Annotation:
        txt = obj.get_text()
        # Drop everything from "samples" up to "class" in each node's annotation text
        txt = re.sub(r"\nsamples[^$]*class", "\nclass", txt)
        obj.set_text(txt)
    return obj
ax.properties()['children'] = [replace_text(i) for i in ax.properties()['children']]
fig.show()
A small improvement to the above proposition from @StupidWolf.
If there are a lot of classes, value = [...] is split into multiple lines:
value = [100, 0, 0, 0, 0,
6, 7, 0, 0, 0, 0,
0, 13]
So rather than substituting text with re.sub(...), I check which line starts the value section:
def replace_text(obj):
    if type(obj) == matplotlib.text.Annotation:
        txt = obj.get_text()
        _lines = txt.splitlines()
        _result = []
        value_index = None
        class_index = None
        for i, _line in enumerate(_lines):
            if "value" in _line:
                value_index = i
            if "class" in _line:
                class_index = i
        # the indices may legitimately be 0, so compare against None explicitly
        assert value_index is not None and class_index is not None
        # keep everything before the "value" block and everything from the "class" line on
        _result = _lines[:value_index] + _lines[class_index:]
        obj.set_text("\n".join(_result))
    return obj
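The modified function is then applied to the axis children in the same way as in the original answer:
ax.properties()['children'] = [replace_text(i) for i in ax.properties()['children']]
fig.show()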
I am playing around with a DBSCAN example to see if it will work for me. In my case, I have clusters of a few points (3-5) close together, with a fairly long distance between clusters. I have tried to replicate the situation in the following code. I figured that with a low epsilon and low min_samples this should work, but instead it tells me that it only sees 1 group (and 20 noise points?). Am I using this incorrectly, or is DBSCAN not good for this type of problem? I went with DBSCAN instead of k-means because I don't know beforehand exactly how many clusters there will be (1-5).
from sklearn.datasets import make_blobs
from sklearn.cluster import DBSCAN
import numpy as np
import matplotlib.pyplot as plt
# Configuration options
num_samples_total = 20
cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)]
num_classes = len(cluster_centers)
#epsilon = 1.0
epsilon = 1e-5
#min_samples = 13
min_samples = 2
# Generate data
X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)
np.save('./clusters.npy', X)
X = np.load('./clusters.npy')
# Compute DBSCAN
db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X)
labels = db.labels_
no_clusters = len(np.unique(labels) )
no_noise = np.sum(np.array(labels) == -1, axis=0)
print('Estimated no. of clusters: %d' % no_clusters)
print('Estimated no. of noise points: %d' % no_noise)
# Generate scatter plot for training data
colors = list(map(lambda x: '#3b4cc0' if x == 1 else '#b40426', labels)) #only set for 2 colors
plt.scatter(X[:,0], X[:,1], c=colors, marker="o", picker=True)
plt.title('Two clusters with data')
plt.xlabel('Axis X[0]')
plt.ylabel('Axis X[1]')
plt.show()
I ended up going with k-means and doing a modified elbow method:
print(__doc__)
# Author: Phil Roth <mr.phil.roth@gmail.com>
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Configuration options
num_samples_total = 20
cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)]
num_classes = len(cluster_centers)
#epsilon = 1.0
epsilon = 1e-5
#min_samples = 13
min_samples = 2
# Generate data
X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)
random_state = 170
#y_pred = KMeans(n_clusters=5, random_state=random_state).fit_predict(X)
#plt.scatter(X[:, 0], X[:, 1], c=y_pred)
#kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
#maybe I don't have to look for an elbow, just go until the value drops below 1.
#also if I do go too far, it just means that the same shape will be shown twice.
clusterIdx = 0
inertia = 100
while inertia > 1:
    clusterIdx = clusterIdx + 1
    kmeans = KMeans(n_clusters=clusterIdx, random_state=0).fit(X)
    inertia = kmeans.inertia_
    print(inertia)
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)
print(clusterIdx)
plt.show()
I have written code that gives me the best number of clusters based on the max value of silhouette_score. Now I want to find out how many values each cluster has. For example, if my result is that the optimal number of clusters is 3, I want to know how many values each cluster has, e.g. the first cluster has 1241 values, the second 3134 values and the third 351 values.
Is it possible to do something like that?
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import scale
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.decomposition import PCA
df = pd.read_csv('CNN Comments.csv')
df = df.head(8000)
#print(df)
x = df['Text Data']
cv = TfidfVectorizer(analyzer = 'word',max_features = 10000, preprocessor=None, lowercase=True, tokenizer=None, stop_words = 'english')
#cv = CountVectorizer(analyzer = 'word', max_features = 8000, preprocessor=None, lowercase=True, tokenizer=None, stop_words = 'english')
x = cv.fit_transform(x)
my_list = []
list_of_clusters = []
for i in range(2,5):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(x)
    my_list.append(kmeans.inertia_)
    cluster_labels = kmeans.fit_predict(x)
    silhouette_avg = silhouette_score(x, cluster_labels) * 100
    print(round(silhouette_avg,2))
    list_of_clusters.append(round(silhouette_avg, 1))
plt.plot(range(2,5),my_list)
plt.show()
number_of_clusters = max(list_of_clusters)
number_of_clusters = list_of_clusters.index(number_of_clusters)+2
print('Number of clusters: ', number_of_clusters)
You can use the array assigned to cluster_labels to get the distribution of cluster assignments. I would recommend using Counter from the collections module.
from collections import Counter
...
cluster_labels = kmeans.fit_predict(x)
cluster_counts = Counter(cluster_labels)
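Printing the counter then shows how many points fall in each cluster, keyed by cluster label (the numbers below are just the hypothetical counts from the question):
print(cluster_counts)
# e.g. Counter({1: 3134, 0: 1241, 2: 351})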
The alternative with numpy:
import numpy as np
...
unique, counts = np.unique(kmeans.fit_predict(x), return_counts=True)
print(dict(zip(unique, counts)))
As the title says, when running the following code I get the error Found input variables with inconsistent numbers of samples: [219, 247]. I have read that the problem should be in the np.array set for X and y, but I cannot pinpoint it, because there is a price for every date, so I don't see why it is happening. Any help will be appreciated, thanks!
import pandas as pd
import quandl, math, datetime
import numpy as np
from sklearn import preprocessing, svm, model_selection
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
df = quandl.get("NASDAQOMX/XNDXT25NNR", authtoken='myapikey')
df = df[['Index Value','High','Low','Total Market Value']]
df['HL_PCT'] = (df['High'] - df['Low']) / df['Index Value'] * 100.0
df = df[['Low','High','HL_PCT']]
forecast_col = 'High'
df.fillna(-99999, inplace=True)
forecast_out = int(math.ceil(0.1*len(df)))
df['label'] = df[forecast_col].shift(-forecast_out)
df.dropna(inplace= True)
X = np.array(df.drop(['label'],1))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]
X = X[:-forecast_out]
y=np.array(df['label'])
#X= X[:-forecast_out+1]
df.dropna(inplace=True)
y= np.array(df['label'])
X_train, X_test, y_train, y_test= model_selection.train_test_split(X,
y,test_size=0.2)
clf= LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
forecast_set= clf.predict(X_lately)
print(forecast_set, accuracy, forecast_out)
df['Forecast'] = np.nan
last_date= df.iloc[-1].name
last_unix= last_date.timestamp()
one_day=86400
next_unix= last_unix + one_day
for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    df.loc[next_date] = [np.nan for _ in range(len(df.columns) - 1)] + [i]
df['High'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()
The expected result should be a plot of the future price prediction for that ticker, but instead it throws the error 'Found input variables with inconsistent numbers of samples: [219, 247]'.
Your problem lies in these two lines extracted from your code:
X = X[:-forecast_out]
y= np.array(df['label'])
You're subsetting X, but leaving y "as it is".
You can check that the shapes indeed differ with:
X.shape, y.shape
Change the last line to:
y= np.array(df[:-forecast_out]['label'])
and you're fine.
Note as well that, instead of these repetitive lines:
y=np.array(df['label'])
#X= X[:-forecast_out+1]
df.dropna(inplace=True) # there is no na at this point
y= np.array(df['label'])
the following line (the solution to your problem) is enough:
y= np.array(df[:-forecast_out]['label'])
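As a quick sanity check (a minimal sketch, assuming df, X and forecast_out are defined as above), you can verify that the row counts match before calling train_test_split:
y = np.array(df[:-forecast_out]['label'])
print(X.shape, y.shape)   # the first dimensions must now match
assert X.shape[0] == y.shape[0]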