My algorithm gives bad clusters while using TF-IDF - python
I'm getting bad clusters. I would like to rewrite it in a way where I can just plug in any algorithm that I would like (e.g. hierarchical, knn, k-means, etc.).
# Takes a plan (an object exposing page_list) and returns the clusters as a
# mapping of cluster label -> list of extract indices.
def run_clustering(plan, make_model=None):
    """Cluster page text extracts by their TF-IDF vectors.

    Pages whose ``text_extract`` is longer than 50 characters are turned into
    TF-IDF vectors (1- to 4-grams, tokenized with ``process_text``). Several
    cluster counts are tried, each one is scored with the silhouette
    coefficient, and the labelling with the highest score is returned.

    Args:
        plan: Object with a ``page_list``; every page must provide
            ``text_extract``, ``document_id`` and ``page_number``.
        make_model: Optional callable ``make_model(n_clusters)`` returning an
            unfitted estimator that offers ``fit(X)`` and exposes ``labels_``
            afterwards. It receives the TF-IDF matrix produced by
            ``TfidfVectorizer.fit_transform``. Defaults to
            ``KMeans(n_clusters=n_clusters)``, so existing callers keep the
            K-Means behaviour.

    Returns:
        A ``collections.defaultdict(list)`` mapping each cluster label to the
        indices of its members. An index is the position of the extract among
        the kept pages (keyed by ``"<document_id>_<page_number>"``, in
        first-seen order). With fewer than three kept extracts no silhouette
        can be computed, so all of them are returned in a single cluster
        ``0`` (or an empty mapping when there are none).
    """
    if make_model is None:
        def make_model(n_clusters):
            return KMeans(n_clusters=n_clusters)

    # Collect the extracts worth clustering; a repeated document/page key
    # keeps only the last text seen for it.
    extracts = {}
    for page in plan.page_list:
        if len(page.text_extract) > 50:
            key = str(page.document_id) + '_' + str(page.page_number)
            extracts[key] = page.text_extract
    extract_lst = list(extracts.values())
    n_docs = len(extract_lst)

    clustering = collections.defaultdict(list)
    if n_docs < 3:
        # The silhouette coefficient needs between 2 and n_docs - 1 clusters,
        # which is impossible here, so skip model selection entirely.
        if n_docs:
            clustering[0].extend(range(n_docs))
        return clustering

    # The vectorizer tokenizes with our process_text function.
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 max_df=0.5,
                                 min_df=0.005,
                                 ngram_range=(1, 4),
                                 lowercase=True)
    tfidf_model = vectorizer.fit_transform(extract_lst)

    # Start below any possible score so that the first candidate is always
    # accepted, even when every silhouette coefficient is <= 0.
    best_score = float('-inf')
    best_labels = None
    iters_since_new_max = 0
    for size in _cluster_sizes_to_test(n_docs):
        model = make_model(size)
        model.fit(tfidf_model)
        labels = model.labels_
        score = silhouette_score(tfidf_model, labels, metric='euclidean')
        if score > best_score:
            best_score = score
            best_labels = labels
            iters_since_new_max = 0
        else:
            iters_since_new_max += 1
            # Give up on ever larger cluster counts once more than four
            # consecutive candidates failed to improve the score.
            if iters_since_new_max > 4:
                break

    # Reuse the labels that were actually scored: refitting a randomly
    # initialised model could yield a different, unscored partition.
    for idx, label in enumerate(best_labels):
        clustering[label].append(idx)
    return clustering


def _cluster_sizes_to_test(n_docs, n_steps=25):
    """Return the cluster counts to try for ``n_docs`` documents.

    Always starts with 2 (useful for very small sets), followed by the
    multiples of ``n_docs // n_steps`` below ``n_docs``. The step is at least
    1 so small inputs cannot cause a division by zero, and counts <= 2 are
    dropped because 1 is invalid for the silhouette and 2 is already listed.
    """
    step = max(1, n_docs // n_steps)
    return [2] + [k for k in range(step, n_docs, step) if k > 2]
I left as many comments as I could to help you all follow what I am going for. Can anyone help me improve this?
You know KMeans is for numeric data only, right? I mean, don't expect it to work on labeled (categorical) data. With KMeans, you calculate the distance to the nearest centroid (cluster center) and add this point to this cluster. What is the 'distance' between apple, banana, and watermelon? It doesn't make sense! So, just make sure you are running your KMeans over numerics.
import numpy as np
import pandas as pd
from pylab import plot,show
from numpy import vstack,array
from scipy.cluster.vq import kmeans,vq
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import seaborn as sns
# Load the data and keep a reproducible 10% sample (drawn with replacement).
df = pd.read_csv('foo.csv')
df = df.sample(frac=0.1, replace=True, random_state=1)

# Keep only the numeric columns of the sample and list their names.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf = df.select_dtypes(include=numerics)
for col in newdf.columns:
    print(col)

# Your independent variables.
X = newdf[['NumericField1', 'NumericField2', 'NumericField3', 'list_price']]
# Your dependent variable.
y = newdf['DependentVariable']

# Turn all numeric columns into one array that can be fed to a clustering
# algorithm. From here on X is that array, not the column subset above.
data = np.asarray(newdf)
X = data

# Compute K-Means with K = 100 (100 clusters), then assign each sample to
# its nearest centroid.
centroids, _ = kmeans(data, 100)
idx, _ = vq(data, centroids)

# Plot the first two columns of the members of clusters 0-4, one colour per
# cluster, using NumPy's boolean-mask indexing; all series go into a single
# plot call.
plot_args = []
for cluster_id, fmt in enumerate(('ob', 'oy', 'or', 'og', 'om')):
    in_cluster = idx == cluster_id
    plot_args += [data[in_cluster, 0], data[in_cluster, 1], fmt]
plot(*plot_args)
# Mark the centroids with green squares.
plot(centroids[:, 0], centroids[:, 1], 'sg', markersize=8)
show()

# Pair every sampled row's brand with the cluster it was assigned to.
details = list(zip(df.brand, idx))
for detail in details:
    print(detail)
I've found Affinity Propagation to produce much tighter clusters than KMeans can achieve. Here is an example.
# Run Affinity Propagation experiment
from itertools import cycle

import matplotlib.pyplot as plt
# AffinityPropagation must be imported explicitly; without this line the
# snippet fails with a NameError.
from sklearn.cluster import AffinityPropagation

# NOTE(review): X is expected to be a 2-D NumPy array defined before this
# snippet runs (it is indexed as X[mask, 0] below) -- confirm.
af = AffinityPropagation(preference=20).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)

# plt.scatter(X[:, 0], X[:, 1], s=50)

# Plot result: start from a clean figure.
plt.close('all')
plt.figure(1)
plt.clf()

# One colour per cluster; cycle() repeats the colours if there are more
# clusters than colour codes.
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    # The exemplar is the sample chosen as the centre of cluster k.
    cluster_center = X[cluster_centers_indices[k]]
    plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    # Draw a line from the exemplar to every member of its cluster.
    for x in X[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
Try these concepts and see how you get along.
Related
How to automatically get the number of colors and the colors returned from a dendrogram clustering hierarchy?
scikit-learn gives an example of python code to generate a dendogram. I copy/paste this code bellow. This code generates a dendogram. This dendogram display 3 differents colors: blue, green, and orange. Question: which code associated with this dendogram code example, could automaticaly deliver: the number of colors generated by the dendrogram ? the list of those of those colors (or their code number) ? import numpy as np from matplotlib import pyplot as plt from scipy.cluster.hierarchy import dendrogram from sklearn.datasets import load_iris from sklearn.cluster import AgglomerativeClustering def plot_dendrogram(model, **kwargs): # Create linkage matrix and then plot the dendrogram # create the counts of samples under each node counts = np.zeros(model.children_.shape[0]) n_samples = len(model.labels_) for i, merge in enumerate(model.children_): current_count = 0 for child_idx in merge: if child_idx < n_samples: current_count += 1 # leaf node else: current_count += counts[child_idx - n_samples] counts[i] = current_count linkage_matrix = np.column_stack( [model.children_, model.distances_, counts] ).astype(float) # Plot the corresponding dendrogram dendrogram(linkage_matrix, **kwargs) iris = load_iris() X = iris.data # setting distance_threshold=0 ensures we compute the full tree. model = AgglomerativeClustering(distance_threshold=0, n_clusters=None) model = model.fit(X) plt.title("Hierarchical Clustering Dendrogram") # plot the top three levels of the dendrogram plot_dendrogram(model, truncate_mode="level", p=3) plt.xlabel("Number of points in node (or index of point if no parenthesis).") plt.show()
If you read the documentation here, the number of colors is determined by color_threshold, which is defaulted to 0.7*max(Z[:,2]). So you only have to find the number of merges higher than that: First modify your code to get the linkage matrix: def get_linkage(model): # Create linkage matrix # create the counts of samples under each node counts = np.zeros(model.children_.shape[0]) n_samples = len(model.labels_) for i, merge in enumerate(model.children_): current_count = 0 for child_idx in merge: if child_idx < n_samples: current_count += 1 # leaf node else: current_count += counts[child_idx - n_samples] counts[i] = current_count linkage_matrix = np.column_stack( [model.children_, model.distances_, counts] ).astype(float) return linkage_matrix iris = load_iris() X = iris.data # setting distance_threshold=0 ensures we compute the full tree. model = AgglomerativeClustering(distance_threshold=0, n_clusters=None) model = model.fit(X) linkage_matrix = get_linkage(model) Then calculate the number of colors from it: from scipy.cluster.hierarchy import cut_tree color_threshold = 0.7 * max(linkage_matrix[:, 2]) n_color = 1 + len(np.unique(cut_tree(linkage_matrix, height = color_threshold))) color_codes = ['C' + str(i) for i in range(n_color)] # this is simply the matplotlib default color code
DBSCAN not making sense for small numbers of points
I am playing around with a dbscan example in order to see if it will work for me. In my case, I have clusters of a few points (3-5) close together with a fairly long distance in between clusters. I have tried to replicate the situation in the following code. I figured with a low epsilon and low min_samples,this should work, but instead it is telling me that it only sees 1 group (and 20 noise points?). Am I using this incorrectly, or is dbscan not good for this type of problem. I went with dbscan instead of kmeans because I dont know beforehand exactly how many clusters there will be (1-5). from sklearn.datasets import make_blobs from sklearn.cluster import DBSCAN import numpy as np import matplotlib.pyplot as plt # Configuration options num_samples_total = 20 cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)] num_classes = len(cluster_centers) #epsilon = 1.0 epsilon = 1e-5 #min_samples = 13 min_samples = 2 # Generate data X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05) np.save('./clusters.npy', X) X = np.load('./clusters.npy') # Compute DBSCAN db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X) labels = db.labels_ no_clusters = len(np.unique(labels) ) no_noise = np.sum(np.array(labels) == -1, axis=0) print('Estimated no. of clusters: %d' % no_clusters) print('Estimated no. of noise points: %d' % no_noise) # Generate scatter plot for training data colors = list(map(lambda x: '#3b4cc0' if x == 1 else '#b40426', labels)) #only set for 2 colors plt.scatter(X[:,0], X[:,1], c=colors, marker="o", picker=True) plt.title('Two clusters with data') plt.xlabel('Axis X[0]') plt.ylabel('Axis X[1]') plt.show()
ended up going with kmeans and doing a modified elbow method: print(__doc__) # Author: Phil Roth <mr.phil.roth#gmail.com> # License: BSD 3 clause import numpy as np import matplotlib.pyplot as plt from sklearn.cluster import KMeans from sklearn.datasets import make_blobs # Configuration options num_samples_total = 20 cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)] num_classes = len(cluster_centers) #epsilon = 1.0 epsilon = 1e-5 #min_samples = 13 min_samples = 2 # Generate data X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05) random_state = 170 #y_pred = KMeans(n_clusters=5, random_state=random_state).fit_predict(X) #plt.scatter(X[:, 0], X[:, 1], c=y_pred) #kmeans = KMeans(n_clusters=2, random_state=0).fit(X) #maybe I dont have to look for an elbow, just go until the value drops below 1. #also if I do go too far, it just means that the same shape will be shown twice. clusterIdx = 0 inertia = 100 while inertia > 1: clusterIdx = clusterIdx + 1 kmeans = KMeans(n_clusters=clusterIdx, random_state=0).fit(X) inertia = kmeans.inertia_ print(inertia) plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_) print(clusterIdx) plt.show()
Sklearn's Affinity Propagation, good dependents bad exemplars
I am attempting to use sklearn's affinity propagation implementation for a fairly easy cluster, however, I am getting some funky results. I was trying to use AP with 300 clusters of 3 points each and it failed miserably so I tried a seemingly easy clustering problem of 5 gaussian distributed clusters with 100 points each. The resulting graph is linked below. Does anyone know where I went wrong? AP plot I followed #Anony-Mousse's response from this, however, increasing the damping and max iterations did not really help. import numpy as np import matplotlib.pyplot as plt from sklearn.cluster import AffinityPropagation from itertools import cycle n_per_cluster = 100 n_clusters = 5 n_total = n_per_cluster*n_clusters x = np.empty(n_total) y = np.empty(n_total) labels = np.empty(n_total) count = 0 for i in range(n_clusters): xseed = np.random.random()*100 yseed = np.random.random()*100 normX = np.random.normal(xseed,1,n_per_cluster) normY = np.random.normal(yseed,1,n_per_cluster) normCount = 0 for j in range(n_per_cluster): x[count] = normX[normCount] y[count] = normY[normCount] labels[count] = i normCount+=1 count+=1 #print(labels) #print(x, y) # plt.scatter(x,y) # plt.show() preference = -50 max_iter = 1000 xy = np.column_stack((x,y)) af = AffinityPropagation(damping = 0.9, preference = preference, verbose = True, max_iter = max_iter).fit(xy) _exemplars_index = af.cluster_centers_indices_ _labels = af.labels_ _n_cluster = len(_exemplars_index) plt.close('all') plt.figure(1) plt.clf() colors = cycle('bgrcmyk') for k,col in zip(range(_n_cluster),colors): class_members = labels == k #error check exemplars = xy[_exemplars_index[k]] plt.plot(xy[class_members, 0], xy[class_members,1], col + '.') plt.plot(exemplars[0], exemplars[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14) for x in xy[class_members]: plt.plot([exemplars[0], x[0]], [exemplars[1], x[1]], col) plt.title('Estimated number of clusters: %d' % _n_cluster) plt.show() It is getting the clusters 
correct, but the exemplars are all across the screen. This is a pretty straightforward clustering problem, so I imagine it is user error, but I haven't figured it out yet. Thanks for the help.
Sorry, I should have tried a parametric sweep before posting. Apparently, AP is just highly sensitive to metric data. I got decent results at preference = -100, damping =0.95, and 1500 iterations.
Filtering 1D numpy arrays in Python
Explanation: I have two numpy arrays: dataX and dataY, and I am trying to filter each array to reduce the noise. The image shown below shows the actual input data (blue dots) and an example of what I want it to be like(red dots). I do not need the filtered data to be as perfect as in the example but I do want it to be as straight as possible. I have provided sample data in the code. What I have tried: Firstly, you can see that the data isn't 'continuous', so I first divided them into individual 'segments' ( 4 of them in this example), and then applied a filter to each 'segment'. Someone suggested that I use a Savitzky-Golay filter. The full, run-able code is below: import scipy as sc import scipy.signal import numpy as np import matplotlib.pyplot as plt # Sample Data ydata = np.array([1,0,1,2,1,2,1,0,1,1,2,2,0,0,1,0,1,0,1,2,7,6,8,6,8,6,6,8,6,6,8,6,6,7,6,5,5,6,6, 10,11,12,13,12,11,10,10,11,10,12,11,10,10,10,10,12,12,10,10,17,16,15,17,16, 17,16,18,19,18,17,16,16,16,16,16,15,16]) xdata = np.array([1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32,33, 1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32]) # Used a diff array to find where there is a big change in Y. # If there's a big change in Y, then there must be a change of 'segment'. 
diffy = np.diff(ydata) # Create empty numpy arrays to append values into filteredX = np.array([]) filteredY = np.array([]) # Chose 3 to be the value indicating the change in Y index = np.where(diffy >3) # Loop through the array start = 0 for i in range (0, (index[0].size +1) ): # Check if last segment is reached if i == index[0].size: print xdata[start:] partSize = xdata[start:].size # Window length must be an odd integer if partSize % 2 == 0: partSize = partSize - 1 filteredDataX = sc.signal.savgol_filter(xdata[start:], partSize, 3) filteredDataY = sc.signal.savgol_filter(ydata[start:], partSize, 3) filteredX = np.append(filteredX, filteredDataX) filteredY = np.append(filteredY, filteredDataY) else: print xdata[start:index[0][i]] partSize = xdata[start:index[0][i]].size if partSize % 2 == 0: partSize = partSize - 1 filteredDataX = sc.signal.savgol_filter(xdata[start:index[0][i]], partSize, 3) filteredDataY = sc.signal.savgol_filter(ydata[start:index[0][i]], partSize, 3) start = index[0][i] filteredX = np.append(filteredX, filteredDataX) filteredY = np.append(filteredY, filteredDataY) # Plots plt.plot(xdata,ydata, 'bo', label = 'Input Data') plt.plot(filteredX, filteredY, 'ro', label = 'Filtered Data') plt.xlabel('X') plt.ylabel('Y') plt.title('Result') plt.legend() plt.show() This is my result: When each point is connected, the result looks as follows. I have played around with the order, but it seems like a third order gave the best result. I have also tried these filters, among a few others: scipy.signal.medfilt scipy.ndimage.filters.uniform_filter1d But so far none of the filters I have tried were close to what I really wanted. What is the best way to filter data such as this? Looking forward to your help.
One way to get something looking close to your ideal would be clustering + linear regression. Note that you have to provide the number of clusters and I also cheated a bit in scaling up y before clustering. import numpy as np from scipy import cluster, stats ydata = np.array([1,0,1,2,1,2,1,0,1,1,2,2,0,0,1,0,1,0,1,2,7,6,8,6,8,6,6,8,6,6,8,6,6,7,6,5,5,6,6, 10,11,12,13,12,11,10,10,11,10,12,11,10,10,10,10,12,12,10,10,17,16,15,17,16, 17,16,18,19,18,17,16,16,16,16,16,15,16]) xdata = np.array([1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32,33, 1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32]) def split_to_lines(x, y, k): yo = np.empty_like(y, dtype=float) # get the cluster centers and the labels for each point centers, map_ = cluster.vq.kmeans2(np.array((x, y * 2)).T.astype(float), k) # for each cluster, use the labels to select the points belonging to # the cluster and do a linear regression for i in range(k): slope, interc, *_ = stats.linregress(x[map_==i], y[map_==i]) # use the regression parameters to construct y values on the # best fit line yo[map_==i] = x[map_==i] * slope + interc return yo import pylab pylab.plot(xdata, ydata, 'or') pylab.plot(xdata, split_to_lines(xdata, ydata, 4), 'ob') pylab.show()
Equivalent of Matlab's cluster quality function?
MATLAB has a nice silhouette function to help evaluate the number of clusters for k-means. Is there an equivalent for Python's Numpy/Scipy as well?
I present below a sample silhouette implementation in both MATLAB and Python/Numpy (keep in mind that I am more fluent in MATLAB): 1) MATLAB function s = mySilhouette(X, IDX) %# X : matrix of size N-by-p, data where rows are instances %# IDX: vector of size N, cluster index of each instance (starting from 1) %# s : vector of size N, silhouette score value of each instance N = size(X,1); %# number of instances K = numel(unique(IDX)); %# number of clusters %# compute pairwise distance matrix D = squareform( pdist(X,'euclidean').^2 ); %# indices belonging to each cluster kIndices = accumarray(IDX, 1:N, [K 1], #(x){sort(x)}); %# compute a,b,s for each instance %# a(i): average distance from i to all other data within the same cluster. %# b(i): lowest average dist from i to the data of another single cluster a = zeros(N,1); b = zeros(N,1); for i=1:N ind = kIndices{IDX(i)}; ind = ind(ind~=i); a(i) = mean( D(i,ind) ); b(i) = min( cellfun(#(ind) mean(D(i,ind)), kIndices([1:K]~=IDX(i))) ); end s = (b-a) ./ max(a,b); end To emulate the plot from the silhouette function in MATLAB, we group the silhouette values by cluster, sort within each, then plot the bars horizontally. 
MATLAB adds NaNs to separate the bars from the different clusters, I found it easier to simply color-code the bars: %# sample data load fisheriris X = meas; N = size(X,1); %# cluster and compute silhouette score K = 3; [IDX,C] = kmeans(X, K, 'distance','sqEuclidean'); s = mySilhouette(X, IDX); %# plot [~,ord] = sortrows([IDX s],[1 -2]); indices = accumarray(IDX(ord), 1:N, [K 1], #(x){sort(x)}); ytick = cellfun(#(ind) (min(ind)+max(ind))/2, indices); ytickLabels = num2str((1:K)','%d'); %#' h = barh(1:N, s(ord),'hist'); set(h, 'EdgeColor','none', 'CData',IDX(ord)) set(gca, 'CLim',[1 K], 'CLimMode','manual') set(gca, 'YDir','reverse', 'YTick',ytick, 'YTickLabel',ytickLabels) xlabel('Silhouette Value'), ylabel('Cluster') %# compare against SILHOUETTE figure, silhouette(X,IDX) 2) Python And here is what I came up with in Python: import numpy as np from scipy.cluster.vq import kmeans2 from scipy.spatial.distance import pdist, squareform from sklearn import datasets import matplotlib.pyplot as plt from matplotlib import cm def silhouette(X, cIDX): """ Computes the silhouette score for each instance of a clustered dataset, which is defined as: s(i) = (b(i)-a(i)) / max{a(i),b(i)} with: -1 <= s(i) <= 1 Args: X : A M-by-N array of M observations in N dimensions cIDX : array of len M containing cluster indices (starting from zero) Returns: s : silhouette value of each observation """ N = X.shape[0] # number of instances K = len(np.unique(cIDX)) # number of clusters # compute pairwise distance matrix D = squareform(pdist(X)) # indices belonging to each cluster kIndices = [np.flatnonzero(cIDX==k) for k in range(K)] # compute a,b,s for each instance a = np.zeros(N) b = np.zeros(N) for i in range(N): # instances in same cluster other than instance itself a[i] = np.mean( [D[i][ind] for ind in kIndices[cIDX[i]] if ind!=i] ) # instances in other clusters, one cluster at a time b[i] = np.min( [np.mean(D[i][ind]) for k,ind in enumerate(kIndices) if cIDX[i]!=k] ) s = 
(b-a)/np.maximum(a,b) return s def main(): # load Iris dataset data = datasets.load_iris() X = data['data'] # cluster and compute silhouette score K = 3 C, cIDX = kmeans2(X, K) s = silhouette(X, cIDX) # plot order = np.lexsort((-s,cIDX)) indices = [np.flatnonzero(cIDX[order]==k) for k in range(K)] ytick = [(np.max(ind)+np.min(ind))/2 for ind in indices] ytickLabels = ["%d" % x for x in range(K)] cmap = cm.jet( np.linspace(0,1,K) ).tolist() clr = [cmap[i] for i in cIDX[order]] fig = plt.figure() ax = fig.add_subplot(111) ax.barh(range(X.shape[0]), s[order], height=1.0, edgecolor='none', color=clr) ax.set_ylim(ax.get_ylim()[::-1]) plt.yticks(ytick, ytickLabels) plt.xlabel('Silhouette Value') plt.ylabel('Cluster') plt.show() if __name__ == '__main__': main() Update: As noted by others, scikit-learn has since then added its own silhouette metric implementation. To use it in the above code, replace the call to the custom-defined silhouette function with: from sklearn.metrics import silhouette_samples ... #s = silhouette(X, cIDX) s = silhouette_samples(X, cIDX) # <-- scikit-learn function ... the rest of the code can still be used as-is to generate the exact same plot.
I've looked, but I can't find a numpy/scipy silhouette function, I even looked in pylab and matplotlib. I think you'll have to implement it yourself. I can point you to http://orange.biolab.si/trac/browser/trunk/orange/orngClustering.py?rev=7462. It has a few functions which implement a silhouette function. Hope this helps.
This is a little late, but for what it is worth, it appears that scikit-learn now implements a silhouette function. See their documentation page or view the source code directly.