Scikit learn: Why is my k-means clustering scatter plot so ugly? - python

def get_nouns (text):
tagger = MeCab.Tagger()
words = []
for c in tagger.parse(text).splitlines()[:-1]:
if len(c.split('\t')) < 2:
continue
surface, feature = c.split('\t')
pos = feature.split(',')[0]
if pos == '名詞': # noun
words.append(surface)
return ' '.join(words)
def bio():
biolist =[]
howmany = 10
for giin in read:
if len(giin["education"]) < 1:
continue
biolist.append(get_nouns(" ".join(giin["education"])))
######################################################
nparray = np.array (biolist)
cv = CountVectorizer()
bags = cv.fit_transform(nparray)
tfidf=TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(bags)
km_model = KMeans(n_clusters=howmany, init='k-means++')
km_model.fit_transform(tfidf)
lsa2 = TruncatedSVD(2)
compressed_text_list = lsa2.fit_transform(tfidf)
compressed_center_list = lsa2.fit_transform(km_model.cluster_centers_)
X = []
Y = []
X_cent = []
Y_cent = []
for x, y in compressed_text_list:
X.append(x)
Y.append(y)
for x, y in compressed_center_list:
X_cent.append(x)
Y_cent.append(y)
clus_list = []
for i in range (howmany):
clus_list.append([])
for a in biolist:
if km_model.labels_[biolist.index(a)] == i:
clus_list[i].append(a)
for a in clus_list:
print (a)
print (" ")
plt.scatter(X, Y, c = km_model.labels_)
plt.scatter(X_cent, Y_cent, c="r", marker = "+")
plt.show()
I have this code that clusters educational history of people into 10 groups. My scatter plot looks like this.
As you can see, the scatter plot is not really sorted into groups and different colors are mixed up with one another. How could this code be changed to make the grouping more precise?

Related

Using Scipy.optimize.curve_fit to fit an exponential

def Data_to_array(file):
r = int
x, y=[],[]
data = []
line_num = 0
#call data
P = open(file,'r')
data = P.readlines()
#Get it to ignore strings
for line in data:
line_num += 1
if line.find("[data]") >= 0:
r = (line_num+1)
# Data = P.readlines()[:r]
# print (Data)
if "Sampling Rate" in line:
SR = float(line[15:])
if "temperature=" in line:
T = float(line[12:18])
print(str("Temperature = "))
print(T)
Data = data[r:line_num]
#assign data into dataframe
df = pd.DataFrame(Data)
#rename column in data
df = df.rename(columns = {0: 'volts'})
#get it to recognise the index
df.index.name = 'Index'
#get it to recognise the data as number
df = df.astype({'volts': float})
#get index to start at 1
df.index += 1
#assign data to lists
I = df.index.to_list()
t = df['volts'].to_list()
#get it to invert data
y = [element * -1 for element in t]
#multiply by sampling rate
x = [element /(SR) for element in I]
return x,y
#This is to create the exponential function
def Exponential_func(file):
temp_array = Data_to_array(file)
X = np.asarray(temp_array[0])
a,b = float()
#Y = temp_array[1]
f = np.exp(a*X) + b
return f
#This is to get the optomize function to work
def Exponential_model(file):
temp_array = Data_to_array(file)
X = np.asarray(temp_array[0])
Y = np.asarray(temp_array[1])
#f = np.exp(X)
#exp_mod = lf.ExponentialModel(X,Y)
#pars = exp_mod.guess(Y, X)
r = sp.optimize.curve_fit(X,Y,Exponential_func.f)
return r
#This is to plot the data
def Plot_Data (file):
temp_array = Data_to_array(file)
X = np.asarray(temp_array[0])
Y = np.asarray(temp_array[1])
#p_0 = np.exp(X)
#sp.optimize.curve_fit(X,Y,p_0)
plt.scatter(X,Y)
#plt.plot(Exponential_model.r)
plt.show()
plt.xlabel("Time (s)")
plt.ylabel("Capacitence (μF)")
# print(Data_to_array('Cz-Si-T-1.txt')[1])
Plot_Data("Cz-Si-T-82.txt")
Exponential_func("Cz-Si-T-82.txt")
Exponential_model("Cz-Si-T-82.txt")
When I try to use the sp.optomize function, I get the error "'function' object has no attribute 'f'" but when looking up this problem I have the function and variables in the correct order.
I need this piece of code to fit an exponential curve to my data, which does have an exp fit, can anyone help? It would also be helpful for the code to print the function of the fitted curve as I will be integrating under this later.

How can I align multiple trajectories in python using dtw?

I have 5 different trajectories for my project. First I read them from the file, then save them to a list by parsing the file. After that I digitized these values. I want to align x and y coordinates separately and plot x and y coordinates together on a grid.
This is what I have done so far. I use dtw package in python, but it takes two lists and gives the path as an array. How can I convert it to an aligned trajectory of 5 different lists?
x_dict = {}
y_dict = {}
for x in ["1", "2", "3", "4", "5"]:
file = open("data-" + x + ".txt", encoding="latin-1")
data = file.read()
pos_list = re.findall(r'position:(.*?)orientation:', data, re.DOTALL)
or_list = re.findall(r'orientation:(.*?)scale:', data, re.DOTALL)
for i in range(len(pos_list)):
pos_list[i] = pos_list[i].replace('\n','')
regexx = re.compile(r'x: (.*?) y:')
regexy = re.compile(r'y: (.*?) z:')
posx_list = [m.group(1) for l in pos_list for m in [regexx.search(l)] if m]
posx_list = list(map(float, posx_list))
posy_list = [m.group(1) for l in pos_list for m in [regexy.search(l)] if m]
posy_list = list(map(float, posy_list))
bins = numpy.linspace(-1, 1, 100)
digitized_x = numpy.digitize(posx_list, bins)
digitized_y = numpy.digitize(posy_list, bins)
x_dict[x] = digitized_x
y_dict[x] = digitized_y
dist, cost, acc, path = dtw(y_dict["5"], y_dict["4"], dist= euclidean)
plt.imshow(acc.T, origin='lower', cmap=cm.gray, interpolation='nearest')
plt.plot(path[0], path[1], 'w')
plt.xlim((-0.5, acc.shape[0]-0.5))
plt.ylim((-0.5, acc.shape[1]-0.5))

K-Means clustering multidimensional data with a heatmap

I have been trying to implement k-means clustering with a heatmap, but have been unsuccessful.
Here is the initial data set:
https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv
And here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random
#%matplotlib inline
def truncate(f, n):
return math.floor(f * 10 ** n) / 10 ** n
def chooseCenter(data, centers):
length = data.shape
cent = []
while len(cent) < centers :
x = random.randrange(0,length[0])
y = random.randrange(0,length[1])
if data.iloc[x][y] not in cent:
d = truncate(data.iloc[x][y],2)
cent.append(d)
return cent
def distance(val, center):
return math.sqrt((val- center)**2)
def getDistances(centers, data):
length = data.shape
dist = []
for i in range(length[0]):
for j in range(length[1]):
y = []
for k in range(len(centers)):
val = distance(data.iloc[i][j], centers[k])
y.append(truncate(val,3))
dist.append(y)
return dist
def findClosest(data, dist):
close = data.copy()
length = close.shape
indexes = []
for i in range(len(dist)):
pt = min(dist[i])
idx = dist[i].index(pt)
indexes.append(idx)
#print(indexes)
length = data.shape
n = np.array(indexes)
n = pd.DataFrame(np.reshape(n, (length[0],length[1])))
#reshape this data frame into the same shape as the data
#keep running the find closest until there is no change
#try heatmap on this?
#this should cluster it, but to make sure test it
#might need to do some tweaking to this
return n
# for i in range(length[0]):
# for j in range(length[1]):
# print('dist[i]', dist[j])
# pt = min(dist[j])
# print(pt)
# idx = dist[j].index(pt)
# close.iloc[i][j] = int(idx)
#return close
def computeNewCenter(data, close):
d = dict()
for i in range(len(close)):
for j in range(len(close[0])):
d[close.iloc[i][j]] = []
for i in range(len(data)):
for j in range(len(data[0])):
if close.iloc[i][j] in d:
d[close.iloc[i][j]].append(data.iloc[i][j])
newCenters = []
for key, value in d.items():
m = np.mean(value)
newCenters.append(truncate(m, 3))
return newCenters
# lst = [[] * numcenters]
# for i in range(len(close)):
# for j in range(len(close[0])):
# if close.iloc[i][j]
def main():
data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv', header=None))
data = data.T
#print(data)
df = pd.DataFrame(data[1:], columns=data[0], dtype=float).T
df = df.iloc[::-1]
# print(df)
# print(df.iloc[1][9])
# print(df)
# print(df.iloc[0][1])
# heatmap = plt.pcolor(df, cmap=plt.cm.bwr)
# plt.colorbar(heatmap)
c = chooseCenter(df, 3)
print(c)
#print(len(c))
dist = getDistances(c, df)
#print(dist)
y = findClosest(df, dist)
# q = []
# for i in range(len(c)):
# q.append([])
# #print(q)
j = computeNewCenter(df, y)
#print(j)
length = df.shape
oldFrame = pd.DataFrame(np.ndarray((length[0],length[1])))
oldFrame = oldFrame.fillna(0)
ct=0
while y.equals(oldFrame) == False:
ct+=1
oldFrame = y.copy()
c = computeNewCenter(df, oldFrame)
#print(c)
dist = getDistances(c, df)
#print(dist)
y = findClosest(df, dist)
#print(y)
#plt.pcolor(df, cmap=plt.cm.bwr)
l = []
for i in range(len(y)):
for j in range(len(y[0])):
if y.iloc[i][j] == 1:
l.append(df.iloc[i][j])
for i in range(len(y)):
for j in range(len(y[0])):
if y.iloc[i][j] == 2:
l.append(df.iloc[i][j])
for i in range(len(y)):
for j in range(len(y[0])):
if y.iloc[i][j] == 0:
l.append(df.iloc[i][j])
l = np.ndarray((length[0],length[1]))
l = pd.DataFrame(l)
print(l)
hm = plt.pcolor(l, cmap=plt.cm.bwr)
plt.colorbar(hm)
# print(y)
# print(c)
# print(ct)
#plt.pcolor(y, cmap=plt.cm.bwr)
if __name__ == '__main__':
main()
My line of thinking was this:
My current thought process was to first randomly choose the centers.
Then create a list of lists for each point for the distance to each center.
Find the index of the minimum distance for each point for each center.
Create a data frame of the same size as the data set and fill each index for each element with the index of the center the point is closest to.
Recompute the center by taking the mean of the points with the same center index
Repeat this process multiple times until the index data frame does not change.
Create a new data frame and add the points which have the same center point close together in the frame.
Then create the heatmap.
This did not seem to work though.
Just wondering, am I on the right track or am I completely off, and if I am on the right track which parts would I need to change in order to fix the issue. If not could you please point me on the right track.
Here is a comparison of the maps:
Here are the maps
The first one is the one my program generated while the second is the way it is supposed to look.
I know my problem lies in some part of the k-means clustering algorithm, and my guess is it is either in the reassignment stage where you reassign the points to the centroids and calculate the new centroids or in the stopping condition in that the algorithm does not run long enough. Also in the back of my head, something tells me that I am not doing this as efficiently as I could have and that I am missing something key. I have watched several videos on K-means clustering and understand it conceptually, I'm just having a hard time implementing it.

Dendrogram plotting in a loop with scipy.cluster.hierarchy.dendrogram function

I'm trying to build some dendrograms in a loop: in every iteration of this loop Distance Matrix is recalculated. However, figures with dendrogram look like they have some (n) dendrogram on it, where n is number of iteration in loop. How can I 'delete' previous dendrogram?
Here it is the part of script I use:
number = 1
for clique_list in cliq_per_chrom:
print number
chrlen = len(clique_list)
print 'calculating DistMatrix'
DistMatrix = np.zeros(shape=(chrlen,chrlen))
#FILLING DISTMATRIX
for i in range(chrlen):
for j in range(chrlen):
if i < j:
recdistlist = []
f_clique = clique_list[i].split('\t')
s_clique = clique_list[j].split('\t')
for pair_cont in itertools.product(f_clique, s_clique):
if pair_cont[0] != pair_cont[1]:
recombDistCorr = pair_rec_dist(pair_cont)
else:
recombDistCorr = 0.0
recdistlist.append(recombDistCorr)
if 'INF' not in recdistlist:
rec_dist_clq = np.mean(recdistlist)
else:
rec_dist_clq = 1.14877996253
DistMatrix[i][j] = rec_dist_clq
DistMatrix[j][i] = rec_dist_clq
#DENDROGRAM BUILDING
linkage_matrix = linkage(DistMatrix, 'average')
dend = dendrogram(linkage_matrix,leaf_rotation=90., leaf_font_size=8.)
plt.title('Hierarchical Clustering Dendrogram of Chromosome'+str(number))
plt.xlabel('Clique of contigs index')
plt.ylabel('Distance')
plt.savefig('Hierarchical Clustering Dendrogram of Chromosome'+str(number)+'.png',dpi=100)
number +=1

how to draw rectangles using list in python

for line, images_files in zip(lines, image_list):
info = line.split(',')
image_index = [int(info[0])]
box_coordiante1 = [info[2]]
box_coordiante2 = [info[3]]
box_coordiante3 = [info[4]]
box_coordiante4 = [info[5]]
prev_image_num = 1
for image_number in image_index: #### read each other image_number
if prev_image_num != image_number: # if read 11111 but appear different number such as 2, 3 and ect
prev_image_num = image_number # the different number becomes pre_image_num(it was 1)
#box_coordinate = [] # empty box_coordinate
#box_coordinate.append(info[2:6])
#print box_coordinate
# box_coordinate.append() #insert 2 to 6 axis
rect = plt.Rectangle((int(box_coordiante1), int(box_coordiante2)), int(box_coordiante3), int(box_coordiante4), linewidth=1, edgecolor='r', facecolor='none')
ax.add_patch(rect)
im = cv2.imread(images_files)
im = im[:, :, (2, 1, 0)]
# # Display the image
plt.imshow(im)
plt.draw()
plt.pause(0.1)
plt.cla()
I am supposed to draw boxes on each picture.
For showing boxes on each picture,
i guess that gather location of boxes and show them at that same time.
So i used a way using LIST to plt.Rectanle
but it said "TypeError: int() argument must be a string or a number, not 'list'"
Are there other ways??
Umm, I just did just. I don't know if this is what you wanted though.
x = 10
y = 10
a = []
for unit for range(x):
a.append(0)
for unit for range(y):
print(a)
I'm not very familiar with Python, but it seems like you want a plain number in the variables image_index and box_coordinateN. It looks like you're assigning single-element arrays to them. Try changing:
image_index = [int(info[0])] // list containing one element: int(info[0])
box_coordiante1 = [info[2]]
box_coordiante2 = [info[3]]
box_coordiante3 = [info[4]]
box_coordiante4 = [info[5]]
to:
image_index = int(info[0]) // number: int(info[0])
box_coordiante1 = info[2]
box_coordiante2 = info[3]
box_coordiante3 = info[4]
box_coordiante4 = info[5]
The answer above is carelessly sloppy and incorrect Python.
It must be rewritten and corrected as follows:
x = 10
y = 10
a = []
for unit in range(x):
a.append(0)
for unit in range(y):
print(a)

Categories

Resources