IndexError when plotting sklearn manifold TSNE - python

I am trying to run t-SNE, but Python shows me this error:
IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices
The data is provided by this link.
Here's the code:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
#Step 1 - Download the data
dataframe_all = pd.read_csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv')
num_rows = dataframe_all.shape[0]
#Step 2 - Clean the data
#count the number of missing elements (NaN) in each column
counter_nan = dataframe_all.isnull().sum()
counter_without_nan = counter_nan[counter_nan==0]
#remove the columns with missing elements
dataframe_all = dataframe_all[counter_without_nan.keys()]
#remove the first 7 columns which contain no discriminative information
dataframe_all = dataframe_all.iloc[:, 7:]
#Step 3: Create feature vectors
x = dataframe_all.iloc[:, :-1].values
standard_scalar = StandardScaler()
x_std = standard_scalar.fit_transform(x)
# t distributed stochastic neighbour embedding (t-SNE) visualization
tsne = TSNE(n_components=2, random_state = 0)
x_test_2d = tsne.fit_transform(x_std)
#scatter plot the sample points among 5 classes
markers=('s','d','o','^','v')
color_map = {0:'red', 1:'blue', 2:'lightgreen', 3:'purple', 4:'cyan'}
plt.figure()
for idx, cl in enumerate(np.unique(x_test_2d)):
    plt.scatter(x=x_test_2d[cl, 0], y=x_test_2d[cl, 1], c=color_map[idx], marker=markers[idx], label=cl)
plt.show()
What do I have to change in order to make this work?

The error is due to the following line:
plt.scatter(x_test_2d[cl, 0], x_test_2d[cl, 1], c=color_map[idx], marker=markers[idx])
Here, cl takes non-integer values, because np.unique(x_test_2d) returns the unique embedding coordinates rather than class labels, and that is what raises the error. For example, the last value that cl takes is 99.46295, so x_test_2d[cl, 0] becomes x_test_2d[99.46295, 0], which is not a valid index.
Define a variable y that holds the class labels, then use:
# variable holding the classes
y = dataframe_all.classe.values
y = np.array([ord(i) for i in y])
#scatter plot the sample points among 5 classes
plt.figure()
plt.scatter(x_test_2d[:, 0], x_test_2d[:, 1], c = y)
plt.show()
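If you still want one marker and color per class, as in the original loop, here is a minimal sketch (assuming y holds the numeric class labels computed above): iterate over np.unique(y) and select the rows of each class with a boolean mask.
# sketch: one marker/color per class, assuming y from above
markers = ('s', 'd', 'o', '^', 'v')
color_map = {0: 'red', 1: 'blue', 2: 'lightgreen', 3: 'purple', 4: 'cyan'}
plt.figure()
for idx, cl in enumerate(np.unique(y)):
    mask = (y == cl)  # boolean mask selecting the samples of this class
    plt.scatter(x_test_2d[mask, 0], x_test_2d[mask, 1],
                c=color_map[idx], marker=markers[idx], label=cl)
plt.legend()
plt.show()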
FULL CODE:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
#Step 1 - Download the data
dataframe_all = pd.read_csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv')
num_rows = dataframe_all.shape[0]
#Step 2 - Clean the data
#count the number of missing elements (NaN) in each column
counter_nan = dataframe_all.isnull().sum()
counter_without_nan = counter_nan[counter_nan==0]
#remove the columns with missing elements
dataframe_all = dataframe_all[counter_without_nan.keys()]
#remove the first 7 columns which contain no discriminative information
dataframe_all = dataframe_all.iloc[:, 7:]
#Step 3: Create feature vectors
x = dataframe_all.iloc[:, :-1].values
standard_scalar = StandardScaler()
x_std = standard_scalar.fit_transform(x)
# t distributed stochastic neighbour embedding (t-SNE) visualization
tsne = TSNE(n_components=2, random_state = 0)
x_test_2d = tsne.fit_transform(x_std)
# variable holding the classes
y = dataframe_all.classe.values # you need this for the colors
y = np.array([ord(i) for i in y]) # convert letters to numbers
#scatter plot the sample points among 5 classes
plt.figure()
plt.scatter(x_test_2d[:, 0], x_test_2d[:, 1], c = y)
plt.show()

Related

Why isn't this Linear Regression line a straight line?

I have points with x and y coordinates that I want to fit a straight line to with linear regression, but I get a jagged-looking line.
I am attempting to use LinearRegression from sklearn.
To create the points, I run a for loop that randomly creates one hundred points in an array that is 100 x 2 in shape. I slice the first column of it for the xs and the second column for the ys.
I expect to get a straight line when I plot m.predict.
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.linear_model import LinearRegression
X = []
adder = 0
for z in range(100):
    r = random.random() * 20
    r2 = random.random() * 15
    X.append([r+adder-0.4, r2+adder])
    adder += 0.6
X = np.array(X)
plt.scatter(X[:,0], X[:,1], s=10)
plt.show()
m = LinearRegression()
m.fit(X[:,0].reshape(1, -1), X[:,1].reshape(1, -1))
plt.plot(m.predict(X[:,0].reshape(1, -1))[0])
I am not good with numpy, but I think the problem is the use of reshape(1, -1) to convert X[:,0] and X[:,1] from 1D to 2D: the resulting 2D arrays each contain a single row (one sample with 100 features), instead of len(X[:,0]) and len(X[:,1]) rows respectively (100 samples with one feature each). This results in an undesired regressor.
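For reference, a minimal numpy-only sketch of the fix this implies (assuming the X array built in the question): reshape with reshape(-1, 1) so that each sample is a row.
# sketch: reshape to (n_samples, 1) so each row is one sample
m = LinearRegression()
m.fit(X[:, 0].reshape(-1, 1), X[:, 1])
plt.scatter(X[:, 0], X[:, 1], s=10)
plt.plot(X[:, 0], m.predict(X[:, 0].reshape(-1, 1)), color='red')
plt.show()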
I was able to recreate this model using pandas and to plot the desired result. The code is as follows:
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.linear_model import LinearRegression
import pandas as pd
X = []
adder = 0
for z in range(100):
    r = random.random() * 20
    r2 = random.random() * 15
    X.append([r+adder-0.4, r2+adder])
    adder += 0.6
X = np.array(X)
y_train = pd.DataFrame(X[:,1],columns=['y'])
X_train = pd.DataFrame(X[:,0],columns=['X'])
#plt.scatter(X_train, y_train, s=10)
#plt.show()
m = LinearRegression()
m.fit(X_train, y_train)
plt.scatter(X_train,y_train)
plt.plot(X_train,m.predict(X_train),color='red')

DBSCAN not making sense for small numbers of points

I am playing around with a DBSCAN example in order to see if it will work for me. In my case, I have clusters of a few points (3-5) close together, with a fairly long distance between clusters. I have tried to replicate the situation in the following code. I figured that with a low epsilon and low min_samples this should work, but instead it is telling me that it only sees 1 group (and 20 noise points?). Am I using this incorrectly, or is DBSCAN not good for this type of problem? I went with DBSCAN instead of k-means because I don't know beforehand exactly how many clusters there will be (1-5).
from sklearn.datasets import make_blobs
from sklearn.cluster import DBSCAN
import numpy as np
import matplotlib.pyplot as plt
# Configuration options
num_samples_total = 20
cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)]
num_classes = len(cluster_centers)
#epsilon = 1.0
epsilon = 1e-5
#min_samples = 13
min_samples = 2
# Generate data
X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)
np.save('./clusters.npy', X)
X = np.load('./clusters.npy')
# Compute DBSCAN
db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X)
labels = db.labels_
no_clusters = len(np.unique(labels) )
no_noise = np.sum(np.array(labels) == -1, axis=0)
print('Estimated no. of clusters: %d' % no_clusters)
print('Estimated no. of noise points: %d' % no_noise)
# Generate scatter plot for training data
colors = list(map(lambda x: '#3b4cc0' if x == 1 else '#b40426', labels)) #only set for 2 colors
plt.scatter(X[:,0], X[:,1], c=colors, marker="o", picker=True)
plt.title('Two clusters with data')
plt.xlabel('Axis X[0]')
plt.ylabel('Axis X[1]')
plt.show()
I ended up going with k-means and doing a modified elbow method:
print(__doc__)
# Author: Phil Roth <mr.phil.roth#gmail.com>
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Configuration options
num_samples_total = 20
cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)]
num_classes = len(cluster_centers)
#epsilon = 1.0
epsilon = 1e-5
#min_samples = 13
min_samples = 2
# Generate data
X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)
random_state = 170
#y_pred = KMeans(n_clusters=5, random_state=random_state).fit_predict(X)
#plt.scatter(X[:, 0], X[:, 1], c=y_pred)
#kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
#maybe I dont have to look for an elbow, just go until the value drops below 1.
#also if I do go too far, it just means that the same shape will be shown twice.
clusterIdx = 0
inertia = 100
while inertia > 1:
    clusterIdx = clusterIdx + 1
    kmeans = KMeans(n_clusters=clusterIdx, random_state=0).fit(X)
    inertia = kmeans.inertia_
    print(inertia)
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)
print(clusterIdx)
plt.show()
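As a side note on the original DBSCAN attempt: with cluster_std = 0.05 and eps = 1e-5, no point has any neighbour within eps, so every point is labelled noise (-1), and len(np.unique(labels)) counts that noise label as the one "cluster". A minimal sketch, assuming the same generated X, with eps raised above the within-cluster spread and the noise label excluded from the count:
# sketch: eps must exceed the typical within-cluster distance (~0.05 here)
db = DBSCAN(eps=0.3, min_samples=2).fit(X)
labels = db.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # ignore the noise label
n_noise = np.sum(labels == -1)
print('Estimated no. of clusters: %d' % n_clusters)
print('Estimated no. of noise points: %d' % n_noise)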

Distribution fitting of Multiple columns

I am trying to get the distribution fitting of my data using scipy.stats. The data contains multiple columns col_1, col_2, col_3 in a single CSV file.
The problem is that the distribution fitting only takes a single column at a time to identify the best-fitting distributions, as I have shown in the code below.
How do I get the distribution fitting of all columns at the same time, e.g. the distribution fitting of col_1, col_2, col_3?
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import scipy
from sklearn.preprocessing import StandardScaler
import scipy.stats
import matplotlib.pyplot as plt
# Load data and select first column
from sklearn import datasets
data_set = datasets.load_breast_cancer()
# Multiple columns of csv
col_1=data_set.data[:,0]
col_2=data_set.data[:,1]
col_3=data_set.data[:,2]
# Create an index array (x) for data
x = np.arange(len(col_1))
size = len(col_1)
plt.hist(col_1)
plt.show()
sc=StandardScaler()
yy = col_1.reshape (-1,1)
sc.fit(yy)
y_std =sc.transform(yy)
y_std = y_std.flatten()
y_std
del yy
dist_names = ['beta',
'expon',
'gamma',
'lognorm',
'norm',
'pearson3',
'triang',
'uniform',
'weibull_min',
'weibull_max']
# Set up empty lists to store results
chi_square = []
p_values = []
# Set up 50 bins for chi-square test
# Observed data will be approximately evenly distributed across all bins
percentile_bins = np.linspace(0,100,51)
percentile_cutoffs = np.percentile(y_std, percentile_bins)
observed_frequency, bins = (np.histogram(y_std, bins=percentile_cutoffs))
cum_observed_frequency = np.cumsum(observed_frequency)
# Loop through candidate distributions
for distribution in dist_names:
    # Set up distribution and get fitted distribution parameters
    dist = getattr(scipy.stats, distribution)
    param = dist.fit(y_std)
    # Obtain the KS test P statistic, round it to 5 decimal places
    p = scipy.stats.kstest(y_std, distribution, args=param)[1]
    p = np.around(p, 5)
    p_values.append(p)
    # Get expected counts in percentile bins
    # This is based on a 'cumulative distribution function' (cdf)
    cdf_fitted = dist.cdf(percentile_cutoffs, *param[:-2], loc=param[-2],
                          scale=param[-1])
    expected_frequency = []
    for bin in range(len(percentile_bins)-1):
        expected_cdf_area = cdf_fitted[bin+1] - cdf_fitted[bin]
        expected_frequency.append(expected_cdf_area)
    # calculate chi-squared
    expected_frequency = np.array(expected_frequency) * size
    cum_expected_frequency = np.cumsum(expected_frequency)
    ss = sum(((cum_expected_frequency - cum_observed_frequency) ** 2) / cum_observed_frequency)
    chi_square.append(ss)
# Collate results and sort by goodness of fit (best at top)
results = pd.DataFrame()
results['Distribution'] = dist_names
results['chi_square'] = chi_square
results['p_value'] = p_values
results.sort_values(['chi_square'], inplace=True)
# Report results
print ('\nDistributions sorted by goodness of fit:')
print ('----------------------------------------')
print (results)
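As for the actual question (fitting all columns at once), one option is to wrap the single-column logic in a function and loop over the columns. A minimal sketch, assuming the col_1, col_2, col_3 and dist_names defined above, and keeping only the KS p-value part of the code for brevity (fit_distributions is an assumed name):
# sketch: wrap the single-column fitting in a function and loop over columns
def fit_distributions(column_data, dist_names):
    """Return KS-test p-values for each candidate distribution on one column."""
    y_std = StandardScaler().fit_transform(column_data.reshape(-1, 1)).flatten()
    p_values = []
    for distribution in dist_names:
        dist = getattr(scipy.stats, distribution)
        param = dist.fit(y_std)
        p_values.append(np.around(scipy.stats.kstest(y_std, distribution, args=param)[1], 5))
    return pd.DataFrame({'Distribution': dist_names, 'p_value': p_values}).sort_values('p_value', ascending=False)

for name, col in [('col_1', col_1), ('col_2', col_2), ('col_3', col_3)]:
    print('\nResults for %s:' % name)
    print(fit_distributions(col, dist_names))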

My algorithm gives bad clusters while using TF-IDF

I'm getting bad clusters. I would like to rewrite this in a way where I can just plug in any algorithm I would like (e.g. hierarchical, k-NN, k-means, etc.).
# imports needed for this snippet
import collections
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score

#takes in our text_extracts dictionary and returns clusters in an indexed list
def run_clustering(plan):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 max_df=0.5,
                                 min_df=0.005,
                                 ngram_range=(1,4),
                                 lowercase=True)
    #set the model with the vectorizer which will tokenize with our process_text function
    extracts = {}
    for page in plan.page_list:
        if len(page.text_extract) > 50:
            extracts[str(page.document_id) + '_' + str(page.page_number)] = page.text_extract
    extract_lst = [extracts[text] for text in extracts]
    tfidf_model = vectorizer.fit_transform(extract_lst)
    #determine cluster number with silhouette coefficient
    #start with 2 as a cluster size in case the set is very small
    num_of_clusters_to_test = [2]
    #going to test 25 more sizes in equal intervals based on the number of docs we are clustering
    intervals_to_test = int(len(extracts) / 25)
    #print(intervals_to_test)
    num_of_clusters_to_test += [i for i in range(len(extracts)) if i % intervals_to_test == 0 and i != 0]
    #these variables will help us determine the max silhouette
    #iters_since_new_max is just being held so that if we aren't reaching optimal size for
    #four iterations in a row, we dont have to keep testing huge cluster sizes
    max_silhouette_coef = 0
    iters_since_new_max = 0
    good_size = 2
    #cluster with a certain cluster size and record the silhouette coefficient
    for size in num_of_clusters_to_test:
        kmeans = KMeans(n_clusters=size).fit(tfidf_model)
        label = kmeans.labels_
        sil_coeff = silhouette_score(tfidf_model, label, metric='euclidean')
        if sil_coeff > max_silhouette_coef:
            max_silhouette_coef = sil_coeff
            good_size = size
            iters_since_new_max = 0
        else:
            iters_since_new_max += 1
        if iters_since_new_max > 4:
            break
    # finally cluster with the good size we want
    km_model = KMeans(n_clusters=good_size)
    km_model.fit(tfidf_model)
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    return clustering
I have left as many comments as I could to help you follow what I am going for. Can anyone help me improve this?
You know KMeans is for numeric data only, right? I mean, don't expect it to work on labeled (categorical) data. With KMeans, you calculate the distance to the nearest centroid (cluster center) and add the point to that cluster. What is the 'distance' between apple, banana, and watermelon? It doesn't make sense! So just make sure you are running your KMeans over numerics.
import numpy as np
import pandas as pd
from pylab import plot,show
from numpy import vstack,array
from scipy.cluster.vq import kmeans,vq
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import seaborn as sns
df = pd.read_csv('foo.csv')
# get only numeric fields from your dataframe
df = df.sample(frac=0.1, replace=True, random_state=1)
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf = df.select_dtypes(include=numerics)
for col in newdf.columns:
    print(col)
# your independent variables
X = newdf[['NumericField1','NumericField2','NumericField3','list_price']]
# your dependent variable
y = newdf['DependentVariable']
# take all numeric features from the corr exercise, and turn into an array
# so we can feed it into a cluetering algorythm
data = np.asarray(newdf)
X = data
# computing K-Means with K = 100 (100 clusters)
centroids,_ = kmeans(data,100)
# assign each sample to a cluster
idx,_ = vq(data,centroids)
# some plotting using numpy's logical indexing
plot(data[idx==0,0],data[idx==0,1],'ob',
data[idx==1,0],data[idx==1,1],'oy',
data[idx==2,0],data[idx==2,1],'or',
data[idx==3,0],data[idx==3,1],'og',
data[idx==4,0],data[idx==4,1],'om')
plot(centroids[:,0],centroids[:,1],'sg',markersize=8)
show()
details = [(name,cluster) for name, cluster in zip(df.brand,idx)]
for detail in details:
    print(detail)
I've found Affinity Propagation to produce much tighter clusters than KMeans can achieve. Here is an example.
# Run Affinity Propagation Experiment
from sklearn.cluster import AffinityPropagation
af = AffinityPropagation(preference=20).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
# plt.scatter(X[:, 0], X[:, 1], s=50)
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle
plt.close('all')
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    cluster_center = X[cluster_centers_indices[k]]
    plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    for x in X[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
Try these concepts and see how you get along.
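Coming back to the original request of being able to plug in any algorithm: since scikit-learn clusterers share a fit()/labels_ interface, one option is to pass the estimator into the function as a parameter. A minimal sketch, assuming the same tfidf_model built in the question (cluster_extracts is an assumed name):
# sketch: accept any scikit-learn clusterer exposing fit() and labels_
import collections
from sklearn.cluster import KMeans, AgglomerativeClustering

def cluster_extracts(tfidf_model, clusterer):
    """Fit the given clusterer on the TF-IDF matrix and group document indices by label."""
    clusterer.fit(tfidf_model)
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(clusterer.labels_):
        clustering[label].append(idx)
    return clustering

# plug in whichever algorithm you like
kmeans_clusters = cluster_extracts(tfidf_model, KMeans(n_clusters=5))
hier_clusters = cluster_extracts(tfidf_model.toarray(), AgglomerativeClustering(n_clusters=5))  # hierarchical needs dense input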

graphing non-linear decision boundary

data can be found here: ex2data2.txt
I'm trying to plot the decision boundary for a non-linear logistic regression like the following image.
I'm not sure what call to plt.contour() I should be using to reproduce this. The related Matlab function call would be:
contour(u, v, z, [0, 0], 'LineWidth', 2)
import scikitplot.plotters as skplt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn import metrics
from ggplot import *
import time
def mapFeature(X1, X2, df=True):
    """
    X1, X2: dtype = pd.DataFrame, float, int
        either a single value or a vector of values
    df : dtype = boolean
        whether it's a single scalar value or a vector of values
    ----------
    Return: dtype = m row vector or m x n vector of feature values
    Calculates each feature and returns its value
    """
    # add a column of ones for intercept parameter
    out = pd.DataFrame({'1': np.ones(X1.size)})
    # max 6th degree polynomial
    for i in range(1, 7):
        for j in range(i+1):
            # all the combinations of polynomials up to 6th degree
            value = (X1**(i-j)) * (X2**j)
            col_name = 'X1**{} * X2**{}'.format(i-j, j)
            # When we give a vector with only one dimension, we need to specify
            # whether to add it as a column or a row. 0 denotes adding a row,
            # and 1 would be a column.
            if df:
                out = out.join(pd.DataFrame({col_name: value}))
            else:
                out = out.join(pd.DataFrame({col_name: value}, index=[0]))
    return out
if __name__ == '__main__':
    data = pd.read_csv('ex2data2.txt', header=None,
                       names=['Test1', 'Test2', 'Pass'])
    X = data.iloc[:, :2]
    y = data.iloc[:, 2]
    X = mapFeature(X.iloc[:, 0], X.iloc[:, 1])
    clf = LogisticRegression().fit(X, y)
    theta = clf.coef_
    u = np.linspace(start, end, 30)
    v = np.linspace(start, end, 30)
    uu, vv = np.meshgrid(u, v)
    z = np.zeros((30, 30))
    for i in range(30):
        for j in range(30):
            z[i, j] = mapFeature(u[i], v[i], df=False).values.dot(theta.T)
    plt.contour(uu, vv, z, [0])
    plt.show()
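For what it's worth, a minimal sketch of the matplotlib call that mirrors the Matlab contour(u, v, z, [0, 0], 'LineWidth', 2): pass levels=[0] to draw only the zero level set and linewidths=2 for the thicker line. Note that the grid loop above probably wants mapFeature(u[i], v[j], ...) rather than v[i], and z may need transposing (z.T) depending on how it is indexed relative to the meshgrid.
# sketch: draw only the zero level set of z with a thicker line
plt.contour(uu, vv, z, levels=[0], linewidths=2)
plt.show()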
