I am playing around with a dbscan example in order to see if it will work for me. In my case, I have clusters of a few points (3-5) close together with a fairly long distance in between clusters. I have tried to replicate the situation in the following code. I figured with a low epsilon and low min_samples,this should work, but instead it is telling me that it only sees 1 group (and 20 noise points?). Am I using this incorrectly, or is dbscan not good for this type of problem. I went with dbscan instead of kmeans because I dont know beforehand exactly how many clusters there will be (1-5).
from sklearn.datasets import make_blobs
from sklearn.cluster import DBSCAN
import numpy as np
import matplotlib.pyplot as plt
# Configuration options
num_samples_total = 20
cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)]
num_classes = len(cluster_centers)
#epsilon = 1.0
epsilon = 1e-5
#min_samples = 13
min_samples = 2
# Generate data
X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)
np.save('./clusters.npy', X)
X = np.load('./clusters.npy')
# Compute DBSCAN
db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X)
labels = db.labels_
no_clusters = len(np.unique(labels) )
no_noise = np.sum(np.array(labels) == -1, axis=0)
print('Estimated no. of clusters: %d' % no_clusters)
print('Estimated no. of noise points: %d' % no_noise)
# Generate scatter plot for training data
colors = list(map(lambda x: '#3b4cc0' if x == 1 else '#b40426', labels)) #only set for 2 colors
plt.scatter(X[:,0], X[:,1], c=colors, marker="o", picker=True)
plt.title('Two clusters with data')
plt.xlabel('Axis X[0]')
plt.ylabel('Axis X[1]')
ended up going with kmeans and doing a modified elbow method:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Configuration options
num_samples_total = 20
cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)]
num_classes = len(cluster_centers)
#epsilon = 1.0
epsilon = 1e-5
#min_samples = 13
min_samples = 2
# Generate data
X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)
random_state = 170
#y_pred = KMeans(n_clusters=5, random_state=random_state).fit_predict(X)
#plt.scatter(X[:, 0], X[:, 1], c=y_pred)
#kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
#maybe I dont have to look for an elbow, just go until the value drops below 1.
#also if I do go too far, it just means that the same shape will be shown twice.
clusterIdx = 0
inertia = 100
while inertia > 1:
clusterIdx = clusterIdx + 1
kmeans = KMeans(n_clusters=clusterIdx, random_state=0).fit(X)
inertia = kmeans.inertia_
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)
im getting bad clusters i would like to rewrite it in a way where i can just plug in any algorithm that i would like (e.g hierarchical, knn, k-means) etc.
#takes in our text_extracts dictionary and returns clusters in an indexed list
def run_clustering(plan):
""" Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
vectorizer = TfidfVectorizer(tokenizer=process_text,
#set the model with the vectorizer which will tokenize with our process_text function
extracts = {}
for page in plan.page_list:
if len(page.text_extract) > 50:
extracts[str(page.document_id) + '_' + str(page.page_number)] = page.text_extract
extract_lst = [extracts[text] for text in extracts]
tfidf_model = vectorizer.fit_transform(extract_lst)
#determine cluster number with silhouette coefficient
#start with 2 as a cluster size in case the set is very small
num_of_clusters_to_test = [2]
#going to test 25 more sizes in equal intervals based on the number of docs we are clustering
intervals_to_test = int(len(extracts) / 25)
num_of_clusters_to_test += [i for i in range(len(extracts)) if i % intervals_to_test == 0 and i != 0]
#these variables will help us determine the max silhouette
#iters_since_new_max is just being held so that if we aren't reaching optimal size for
#four iterations in a row, we dont have to keep testing huge cluster sizes
max_silhouette_coef = 0
iters_since_new_max = 0
good_size = 2
#cluster with a certain cluster size and record the silhouette coefficient
for size in num_of_clusters_to_test:
kmeans = KMeans(n_clusters=size).fit(tfidf_model)
label = kmeans.labels_
sil_coeff = silhouette_score(tfidf_model, label, metric='euclidean')
if sil_coeff > max_silhouette_coef:
max_silhouette_coef = sil_coeff
good_size = size
iters_since_new_max = 0
iters_since_new_max += 1
if iters_since_new_max > 4:
# finally cluster for with the good size we want
km_model = KMeans(n_clusters=good_size)
clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
return clustering
left as much comment as i can to help you all follow what i am going for can anyone help me improve this
You know KMeans if for numeric data only, right. I mean, don't expect it to work on labeled data. With KMeans, you calculate the distance to the nearest centroid (cluster center) and add this point to this cluster. What is the 'distance' between apple, banana, and watermelon? It doesn't make sense! So, just make sure you are running your KMeans over numerics.
import numpy as np
import pandas as pd
from pylab import plot,show
from numpy import vstack,array
from scipy.cluster.vq import kmeans,vq
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import seaborn as sns
df = pd.read_csv('foo.csv')
# get only numeric fields from your dataframe
df = df.sample(frac=0.1, replace=True, random_state=1)
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf = df.select_dtypes(include=numerics)
for col in newdf.columns:
# your independent variables
X = newdf[['NumericField1','NumericField2','NumericField3','list_price']]
# your dependent variable
y = newdf['DependentVariable']
# take all numeric features from the corr exercise, and turn into an array
# so we can feed it into a cluetering algorythm
data = np.asarray(newdf)
X = data
# computing K-Means with K = 100 (100 clusters)
centroids,_ = kmeans(data,100)
# assign each sample to a cluster
idx,_ = vq(data,centroids)
# some plotting using numpy's logical indexing
details = [(name,cluster) for name, cluster in zip(df.brand,idx)]
for detail in details:
I've found Affinity Propogation to produce much tighter clusters than KMeans can achieve. Here is an example.
# Run Affinity Propogation Experiment
af = AffinityPropagation(preference=20).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
# plt.scatter(X[:, 0], X[:, 1], s=50)
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
class_members = labels == k
cluster_center = X[cluster_centers_indices[k]]
plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
markeredgecolor='k', markersize=14)
for x in X[class_members]:
plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
plt.title('Estimated number of clusters: %d' % n_clusters_)
Try these concepts and see how you get along.
I am currently doing a project where my team and I have to pick a dataset and apply some machine learning methods on it (SLR, MLR etc.), hence for me, I am doing logical regression. My dataset is related to the top hit songs on Spotify from 2010-2019, and I want to see how the duration and danceability of a song affects its popularity. Given that the popularity values is numerical, I have converted the popularity value of each song to binary values. Hence, the popularity value of a song will change to "0" if it is below 65, and "1" if it is above the value of 65. I then decided to plot a 2d logistic regression plot for two dimensions. The end result is that both the "0" and "1" values are all gathered in the same area, where they are supposed to be separated from each other and there should be a decision boundary at .5 showing. I just want to know what does this show about the relationship between the popularity of the songs and their duration and danceability respectively. Is this supposed to be normal or did i make a mistake?
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
df = pd.read_csv('top10s [SubtitleTools.com] (2).csv')
BPM = df.bpm
BPM = np.array(BPM)
Energy = df.nrgy
Energy = np.array(Energy)
dB = df.dB
dB = np.array(dB)
Live = df.live
Live = np.array(Live)
Valence = df.val
Valence = np.array(Valence)
Acous = df.acous
Acous = np.array(Acous)
Speech = df.spch
Speech = np.array(Speech)
def LogReg0732():
Dur = df.dur
Dur = np.array(Dur)
Dance = df.dnce
Dance = np.array(Dance)
Pop = df.popu
df.loc[df['popu'] <= 65, 'popu'] = 0
df.loc[df['popu'] > 65, 'popu'] = 1
Pop = np.array(Pop)
X = Dur
X = np.stack((X, Dance))
y = Pop
clf = LogisticRegression().fit(X.T, y)
print("Coef ", clf.intercept_, clf.coef_)
xx, yy = np.mgrid[np.min(Dur):np.max(Dur), np.min(Dance):np.max(Dance)]
gridxy = np.c_[xx.ravel(), yy.ravel()]
probs = clf.predict_proba(gridxy)[:,1].reshape(xx.shape)
f, ax = plt.subplots(figsize=(20,8))
contour = ax.contourf(xx, yy, probs, 25, cmap="BrBG", vmin=0, vmax=1)
ax_c = f.colorbar(contour)
ax_c.set_ticks([0, 1/4, 1/2, 3/4, 1])
idx = np.where(y==1); idx = np.reshape(idx,np.shape(idx)[1])
y1 = X[:,idx]
idx = np.where(y==0); idx = np.reshape(idx,np.shape(idx)[1])
y0 = X[:,idx]
ax.scatter(y1[0,:], y1[1,:], c='green')
ax.scatter(y0[0,:], y0[1,:], c='blue')
So the main reason, assuming the there is no mistake in the model building process, is that durability and dancibilility are not good features for your problem. You likely have to add more features.
To understand the model in detail you would have to run a variety of statistical test, but I think in short they will all result in the same answer, that this isn't a good fit.
1)If you dont want to add more features, you can also try changing the Cutoff value from 65 to something else, it might help.
2)Try normalizing your data.
I have two numpy arrays: dataX and dataY, and I am trying to filter each array to reduce the noise. The image shown below shows the actual input data (blue dots) and an example of what I want it to be like(red dots). I do not need the filtered data to be as perfect as in the example but I do want it to be as straight as possible. I have provided sample data in the code.
What I have tried:
Firstly, you can see that the data isn't 'continuous', so I first divided them into individual 'segments' ( 4 of them in this example), and then applied a filter to each 'segment'. Someone suggested that I use a Savitzky-Golay filter. The full, run-able code is below:
import scipy as sc
import scipy.signal
import numpy as np
import matplotlib.pyplot as plt
# Sample Data
ydata = np.array([1,0,1,2,1,2,1,0,1,1,2,2,0,0,1,0,1,0,1,2,7,6,8,6,8,6,6,8,6,6,8,6,6,7,6,5,5,6,6, 10,11,12,13,12,11,10,10,11,10,12,11,10,10,10,10,12,12,10,10,17,16,15,17,16, 17,16,18,19,18,17,16,16,16,16,16,15,16])
xdata = np.array([1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32,33, 1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32])
# Used a diff array to find where there is a big change in Y.
# If there's a big change in Y, then there must be a change of 'segment'.
diffy = np.diff(ydata)
# Create empty numpy arrays to append values into
filteredX = np.array([])
filteredY = np.array([])
# Chose 3 to be the value indicating the change in Y
index = np.where(diffy >3)
# Loop through the array
start = 0
for i in range (0, (index[0].size +1) ):
# Check if last segment is reached
if i == index[0].size:
print xdata[start:]
partSize = xdata[start:].size
# Window length must be an odd integer
if partSize % 2 == 0:
partSize = partSize - 1
filteredDataX = sc.signal.savgol_filter(xdata[start:], partSize, 3)
filteredDataY = sc.signal.savgol_filter(ydata[start:], partSize, 3)
filteredX = np.append(filteredX, filteredDataX)
filteredY = np.append(filteredY, filteredDataY)
print xdata[start:index[0][i]]
partSize = xdata[start:index[0][i]].size
if partSize % 2 == 0:
partSize = partSize - 1
filteredDataX = sc.signal.savgol_filter(xdata[start:index[0][i]], partSize, 3)
filteredDataY = sc.signal.savgol_filter(ydata[start:index[0][i]], partSize, 3)
start = index[0][i]
filteredX = np.append(filteredX, filteredDataX)
filteredY = np.append(filteredY, filteredDataY)
# Plots
plt.plot(xdata,ydata, 'bo', label = 'Input Data')
plt.plot(filteredX, filteredY, 'ro', label = 'Filtered Data')
This is my result:
When each point is connected, the result looks as follows.
I have played around with the order, but it seems like a third order gave the best result.
I have also tried these filters, among a few others:
But so far none of the filters I have tried were close to what I really wanted. What is the best way to filter data such as this? Looking forward to your help.
One way to get something looking close to your ideal would be clustering + linear regression.
Note that you have to provide the number of clusters and I also cheated a bit in scaling up y before clustering.
import numpy as np
from scipy import cluster, stats
ydata = np.array([1,0,1,2,1,2,1,0,1,1,2,2,0,0,1,0,1,0,1,2,7,6,8,6,8,6,6,8,6,6,8,6,6,7,6,5,5,6,6, 10,11,12,13,12,11,10,10,11,10,12,11,10,10,10,10,12,12,10,10,17,16,15,17,16, 17,16,18,19,18,17,16,16,16,16,16,15,16])
xdata = np.array([1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32,33, 1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32])
def split_to_lines(x, y, k):
yo = np.empty_like(y, dtype=float)
# get the cluster centers and the labels for each point
centers, map_ = cluster.vq.kmeans2(np.array((x, y * 2)).T.astype(float), k)
# for each cluster, use the labels to select the points belonging to
# the cluster and do a linear regression
for i in range(k):
slope, interc, *_ = stats.linregress(x[map_==i], y[map_==i])
# use the regression parameters to construct y values on the
# best fit line
yo[map_==i] = x[map_==i] * slope + interc
return yo
import pylab
pylab.plot(xdata, ydata, 'or')
pylab.plot(xdata, split_to_lines(xdata, ydata, 4), 'ob')
I am new to the LDA and I have three questions. I would like to classify my text (tags) with the LDA. First I filter the words, which have been used only by one user, machine tags, tags containing only digits and tags with the frequency less than 3.
Then, I calculate the amount of topics with the Elbow method and there I get the memory error (this will be the third question). So the amount of topics suggested by the Elbow method is 8 (I have filtered some more tags to overcome the memory issue but I would need to apply it to bigger datasets in the future).
Should I use tf-idf as a preprocessing step for the LDA? Or if I filter the "useless" tags before it doesn't make sense? I think I don't understand what is going on exactly in the LDA.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda = ldamodel.LdaModel(corpus_tfidf, id2word=dictionary, alpha = 0.1, num_topics=8)
corpus_lda = lda[corpus_tfidf]
Does it make sense to validate the topics quality with the LSI? As I understand the LSI is a method for dimensionality reduction, so I use it to apply K-Means and to see if the 8 clusters of the topics actually look like clusters. But to be honest I don't really understand what exactly I am visualising.
lsi = models.LsiModel(corpus_lda, id2word=dictionary, num_topics=2)
lsi_coord = "file.csv"
fcoords = codecs.open(lsi_coord,'w','utf-8')
for vector in lsi[corpus_lda]:
if len(vector) != 2:
fcoords.writelines("%6.12f\t%6.12f\n" % (vector[0][1],vector[1][1]))
num_topics = 8
X = np.loadtxt(lsi_coord, delimiter="\t")
my_kmeans = KMeans(num_topics).fit(X)
k_means_labels = my_kmeans.labels_
k_means_cluster_centers = my_kmeans.cluster_centers_
colors = ['b','g','r','c','m','y','k','greenyellow']
for k, col in zip(range(num_topics), colors):
my_members = k_means_labels == k
plt.scatter(X[my_members, 0], X[my_members, 1], s=30, c=colors[k], zorder=10)
cluster_center = k_means_cluster_centers[k]
plt.scatter(cluster_center[0], cluster_center[1], marker='x', s=30, linewidths=3, color='r', zorder=10)
plt.title('K-means clustering')
Memory issues. I am trying to create a matrix which has values for every unique term. So if the term is not in the document it gets zero. So it is a sparse matrix, because I have around 1300 unique terms and every document has about 5. And the memory issue arise at the converting to np.array. I guess I have to optimize the matrix somehow.
# creating term-by-document matrix
Y = []
for z in corpus_lda:
for g in z:
while counter < len(dictionary.keys()):
if counter in temp_dict.keys():
Y = np.array(Y)
The following code I took from here : Calculating the percentage of variance measure for k-means?
K = range(1,30) # amount of clusters
KM = [kmeans(Y,k) for k in K]
KM = []
for k in K:
KM_result = kmeans(Y,k)
centroids = [cent for (cent,var) in KM]
scipy.spatial.distance import cdist
D_k = [cdist(Y, cent, 'euclidean') for cent in centroids]
cIdx = [np.argmin(D,axis=1) for D in D_k]
dist = [np.min(D,axis=1) for D in D_k]
avgWithinSS = [sum(d)/Y.shape[0] for d in dist]
kIdx = 8
# elbow curve
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(K, avgWithinSS, 'b*-')
ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12, markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.xlabel('Number of clusters')
plt.ylabel('Average within-cluster sum of squares')
plt.title('Elbow for KMeans clustering')
Any ideas for any of the questions are highly appreciated!