Dendrogram plotting in a loop with scipy.cluster.hierarchy.dendrogram function - python

I'm trying to build some dendrograms in a loop: in every iteration of this loop the distance matrix is recalculated. However, the n-th saved figure looks like it has n dendrograms drawn on top of each other, where n is the iteration number. How can I 'delete' the previous dendrogram before drawing the next one?
Here is the part of the script I use:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

number = 1
for clique_list in cliq_per_chrom:  # cliq_per_chrom and pair_rec_dist are defined elsewhere in the script
    print number
    chrlen = len(clique_list)
    print 'calculating DistMatrix'
    DistMatrix = np.zeros(shape=(chrlen, chrlen))
    # FILLING DISTMATRIX
    for i in range(chrlen):
        for j in range(chrlen):
            if i < j:
                recdistlist = []
                f_clique = clique_list[i].split('\t')
                s_clique = clique_list[j].split('\t')
                for pair_cont in itertools.product(f_clique, s_clique):
                    if pair_cont[0] != pair_cont[1]:
                        recombDistCorr = pair_rec_dist(pair_cont)
                    else:
                        recombDistCorr = 0.0
                    recdistlist.append(recombDistCorr)
                if 'INF' not in recdistlist:
                    rec_dist_clq = np.mean(recdistlist)
                else:
                    rec_dist_clq = 1.14877996253
                DistMatrix[i][j] = rec_dist_clq
                DistMatrix[j][i] = rec_dist_clq
    # DENDROGRAM BUILDING
    linkage_matrix = linkage(DistMatrix, 'average')
    dend = dendrogram(linkage_matrix, leaf_rotation=90., leaf_font_size=8.)
    plt.title('Hierarchical Clustering Dendrogram of Chromosome' + str(number))
    plt.xlabel('Clique of contigs index')
    plt.ylabel('Distance')
    plt.savefig('Hierarchical Clustering Dendrogram of Chromosome' + str(number) + '.png', dpi=100)
    number += 1
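With pyplot's implicit state machine, every dendrogram call in the loop draws into the same current figure, so figure n accumulates all n dendrograms. Below is a minimal, self-contained sketch of the usual fix; the toy random matrices stand in for the recalculated DistMatrix, and it also converts the square matrix with squareform, since scipy's linkage expects a condensed distance matrix rather than a square one:

import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform

for number in range(1, 4):
    # stand-in for the DistMatrix recalculated in iteration `number`
    m = np.random.rand(6, 6)
    DistMatrix = (m + m.T) / 2.0
    np.fill_diagonal(DistMatrix, 0.0)
    plt.figure()  # open a fresh figure so nothing accumulates across iterations
    dendrogram(linkage(squareform(DistMatrix), 'average'),
               leaf_rotation=90., leaf_font_size=8.)
    plt.title('Dendrogram %d' % number)
    plt.savefig('dendrogram_%d.png' % number, dpi=100)
    plt.close()  # release the figure so memory does not grow with the loop

Calling plt.clf() after plt.savefig() achieves the same effect if you prefer to reuse a single figure.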


Eliminating Certain Values in Dataframe

Initial Data
import numpy as np
import pandas as pd

d = {'RedVal': [1,1.1,2,1.5,1.7,2,1,1.1,2,1,1.1,2,2.6,2.5,2.4,2.5],
     'GreenVal': [1,1.1,1.1,1,1.1,1.7,1,1.1,1.5,1,1.9,3,2.8,2.7,2.6,2.5],
     'Frame': [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3],
     'Particle': [0,0,0,0,2,2,2,2,3,3,3,3,4,4,4,4]}
testframe = pd.DataFrame(data=d)

framenot = 2  # how many frames to use for the initial ratio
ratarray = []  # initialize blank ratio array
# sort_values returns a sorted copy, so assign the result back
testframe = testframe.sort_values(by=['Particle', 'Frame'])
for particle in range(0, 5):
    if not (testframe['Particle'] == particle).any():
        continue  # no rows for this particle id (e.g. particle 1)
    newframe = testframe.loc[(testframe['Frame'] <= framenot) & (testframe['Particle'] == particle)]
    for i in range(framenot):
        GVal = newframe['GreenVal'].values[i]
        RVal = newframe['RedVal'].values[i]
        ratio = RVal / GVal
        ratarray.append(ratio)
ratarray = np.array(ratarray)
avgRatios = np.average(ratarray.reshape(-1, framenot), axis=1)
stdRatios = np.std(ratarray.reshape(-1, framenot), axis=1)
print(avgRatios)  # average ratio per particle over the initial frames
print(stdRatios)
So far I have code that gives the average and standard deviation of each particle's Red/Green ratio over frames 0 and 1. Now I want to compare this average ratio to the ratios in the next x frames and eliminate particles whose subsequent-frame ratios fall outside avg ± 2·stdev. Not quite sure how to do this. Any help is appreciated.
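For the elimination step, here is a minimal sketch of one interpretation, reusing testframe and framenot from above; it recomputes each particle's baseline mean and std with a groupby (so the bounds stay labeled by particle id) and keeps only particles whose ratios in all later frames fall inside mean ± 2·std:

# Baseline from the first `framenot` frames (frames 0 and 1 here).
base = testframe[testframe['Frame'] < framenot].copy()
base['Ratio'] = base['RedVal'] / base['GreenVal']
stats = base.groupby('Particle')['Ratio'].agg(['mean', 'std'])

# A particle survives only if every ratio in the later frames stays
# within mean +/- 2*std of its own baseline.
later = testframe[testframe['Frame'] >= framenot].copy()
later['Ratio'] = later['RedVal'] / later['GreenVal']
keep = []
for particle, grp in later.groupby('Particle'):
    lo = stats.loc[particle, 'mean'] - 2 * stats.loc[particle, 'std']
    hi = stats.loc[particle, 'mean'] + 2 * stats.loc[particle, 'std']
    if grp['Ratio'].between(lo, hi).all():
        keep.append(particle)
filtered = testframe[testframe['Particle'].isin(keep)]
print(filtered)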

Scikit learn: Why is my k-means clustering scatter plot so ugly?

import MeCab
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

def get_nouns(text):
    tagger = MeCab.Tagger()
    words = []
    for c in tagger.parse(text).splitlines()[:-1]:
        if len(c.split('\t')) < 2:
            continue
        surface, feature = c.split('\t')
        pos = feature.split(',')[0]
        if pos == '名詞':  # noun
            words.append(surface)
    return ' '.join(words)

def bio():
    biolist = []
    howmany = 10
    for giin in read:  # `read` (the list of records with an "education" field) is loaded elsewhere
        if len(giin["education"]) < 1:
            continue
        biolist.append(get_nouns(" ".join(giin["education"])))
    ######################################################
    nparray = np.array(biolist)
    cv = CountVectorizer()
    bags = cv.fit_transform(nparray)
    tfidf = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(bags)
    km_model = KMeans(n_clusters=howmany, init='k-means++')
    km_model.fit_transform(tfidf)
    lsa2 = TruncatedSVD(2)
    compressed_text_list = lsa2.fit_transform(tfidf)
    compressed_center_list = lsa2.fit_transform(km_model.cluster_centers_)
    X = []
    Y = []
    X_cent = []
    Y_cent = []
    for x, y in compressed_text_list:
        X.append(x)
        Y.append(y)
    for x, y in compressed_center_list:
        X_cent.append(x)
        Y_cent.append(y)
    clus_list = []
    for i in range(howmany):
        clus_list.append([])
        for a in biolist:
            if km_model.labels_[biolist.index(a)] == i:
                clus_list[i].append(a)
    for a in clus_list:
        print(a)
        print(" ")
    plt.scatter(X, Y, c=km_model.labels_)
    plt.scatter(X_cent, Y_cent, c="r", marker="+")
    plt.show()
I have this code that clusters people's educational histories into 10 groups. My scatter plot looks like this (image attached).
As you can see, the points are not really sorted into groups, and the different colors are mixed up with one another. How could this code be changed to make the grouping more precise?
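Two likely culprits, sketched under the assumption that a clearer 2-D picture is the goal. First, the documents and the cluster centers are projected with two separate fit_transform calls, so they end up in two different SVD bases; second, k-means runs in the full tf-idf space, whose clusters need not look separated after any 2-D projection. Fitting the SVD once on the documents and reusing it for the centers at least puts both in the same coordinates:

lsa2 = TruncatedSVD(n_components=2)
compressed_text_list = lsa2.fit_transform(tfidf)  # fit the 2-D basis on the documents
compressed_center_list = lsa2.transform(km_model.cluster_centers_)  # project centers into the SAME basis

If the plot is still muddled, a common variant is to reduce first and cluster in the reduced space (km_model.fit(compressed_text_list)), which guarantees the colors match the visible geometry, at the cost of clustering an approximation of the data.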

Perceptron learning algorithm: is it impossible for PLA to halt at certain numbers of updates?

I'm currently learning about PLA, and I found something weird about my implementation.
First of all, the data is linearly separable, so the algorithm converges every time.
I implemented my PLA to visit the examples in a fixed, pre-determined random cycle, and repeated the experiment a large number of times.
When I plot the number of updates vs. frequency, some update counts have frequency 0: for example, the algorithm never halts after exactly 34, 36, 38, or 40 updates. Why is this?
Also, why isn't the plot a single-peaked curve but rather a three-peaked one?
(Images: the training data and my PLA histogram of update counts vs. frequency.)
Here is my implementation:
import numpy as np
import random as rd
import matplotlib.pyplot as plt

#Read Data
Data_X = []
Data_Y = []
Train_Data = "hw1_7_train"
f = open(Train_Data, "r")
if f.mode == "r":
    fl = f.readlines()
    for line in fl:
        Data = line.split()
        Data_X.append([1.0] + [float(i) for i in Data[0:4]])
        Data_Y.append(int(Data[4]))
f.close()
Data_X = np.array(Data_X)
Data_Y = np.array(Data_Y)

def GoNext(Pos, Length):
    if Pos < Length-1:
        Pos += 1
    else:
        Pos = 0
    return Pos

def sign(x):
    return -1 if x <= 0 else 1

def PLA(X, Y, cycle):
    weight = np.array([0.0] * 5)
    Length = len(Y)
    Success = 0
    LastFail = -1
    Current_Id = 0
    UpdateCount = 0
    while not Success:
        Current_Pos = cycle[Current_Id]
        Sign = sign(np.inner(weight, X[Current_Pos]))
        #If the current point is correct, check if we can halt, else continue
        if Sign == Y[Current_Pos]:
            #Went a full round without a mistake with the initial weight (NOT likely to happen)
            if LastFail == -1 and Current_Id == Length-1:
                Success = 1
                continue
            #Went a full round without a mistake since the last point we modified
            elif Current_Pos == LastFail:
                Success = 1
                continue
            #Otherwise go to the next point
            else:
                Current_Id = GoNext(Current_Id, Length)
                continue
        #If the current point is an error, modify the weight
        else:
            LastFail = Current_Pos
            weight += Y[Current_Pos] * X[Current_Pos]
            UpdateCount += 1
            Current_Id = GoNext(Current_Id, Length)
            continue
    return UpdateCount

TotalIter = 9999
TrackUpdateCount = []
for iter in range(TotalIter):
    Cycle = list(range(len(Data_Y)))
    rd.shuffle(Cycle)
    TrackUpdateCount.append(PLA(Data_X, Data_Y, Cycle))
print("Average Update: ", sum(TrackUpdateCount)/TotalIter)

#Plotting...
UpperBound = max(TrackUpdateCount)
LowerBound = min(TrackUpdateCount)
x_axis = list(range(LowerBound, UpperBound+1))
y_axis = [0]*(UpperBound-LowerBound+1)
for i in range(LowerBound, UpperBound+1):
    y_axis[i-LowerBound] = TrackUpdateCount.count(i)
plt.bar(x_axis, y_axis)
plt.xlabel("Number of updates")
plt.ylabel("Frequency")
plt.show()
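As a quick sanity check that the zero-frequency bins are real and not a binning artifact of plt.bar, the exact frequency of every update count can be printed (a small sketch reusing TrackUpdateCount from the script above):

from collections import Counter

counts = Counter(TrackUpdateCount)
for n_updates in range(min(counts), max(counts) + 1):
    # a 0 here means PLA never halted after exactly this many updates
    print(n_updates, counts.get(n_updates, 0))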

K-Means clustering multidimensional data with a heatmap

I have been trying to implement k-means clustering with a heatmap, but have been unsuccessful.
Here is the initial data set:
https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv
And here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random
#%matplotlib inline

def truncate(f, n):
    return math.floor(f * 10 ** n) / 10 ** n

def chooseCenter(data, centers):
    length = data.shape
    cent = []
    while len(cent) < centers:
        x = random.randrange(0, length[0])
        y = random.randrange(0, length[1])
        if data.iloc[x][y] not in cent:
            d = truncate(data.iloc[x][y], 2)
            cent.append(d)
    return cent

def distance(val, center):
    return math.sqrt((val - center)**2)

def getDistances(centers, data):
    length = data.shape
    dist = []
    for i in range(length[0]):
        for j in range(length[1]):
            y = []
            for k in range(len(centers)):
                val = distance(data.iloc[i][j], centers[k])
                y.append(truncate(val, 3))
            dist.append(y)
    return dist

def findClosest(data, dist):
    close = data.copy()
    length = close.shape
    indexes = []
    for i in range(len(dist)):
        pt = min(dist[i])
        idx = dist[i].index(pt)
        indexes.append(idx)
    #print(indexes)
    length = data.shape
    n = np.array(indexes)
    n = pd.DataFrame(np.reshape(n, (length[0], length[1])))
    #reshape this data frame into the same shape as the data
    #keep running the find closest until there is no change
    #try heatmap on this?
    #this should cluster it, but to make sure test it
    #might need to do some tweaking to this
    return n
    # for i in range(length[0]):
    #     for j in range(length[1]):
    #         print('dist[i]', dist[j])
    #         pt = min(dist[j])
    #         print(pt)
    #         idx = dist[j].index(pt)
    #         close.iloc[i][j] = int(idx)
    # return close

def computeNewCenter(data, close):
    d = dict()
    for i in range(len(close)):
        for j in range(len(close[0])):
            d[close.iloc[i][j]] = []
    for i in range(len(data)):
        for j in range(len(data[0])):
            if close.iloc[i][j] in d:
                d[close.iloc[i][j]].append(data.iloc[i][j])
    newCenters = []
    for key, value in d.items():
        m = np.mean(value)
        newCenters.append(truncate(m, 3))
    return newCenters
    # lst = [[] * numcenters]
    # for i in range(len(close)):
    #     for j in range(len(close[0])):
    #         if close.iloc[i][j]

def main():
    data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv', header=None))
    data = data.T
    #print(data)
    df = pd.DataFrame(data[1:], columns=data[0], dtype=float).T
    df = df.iloc[::-1]
    # print(df)
    # print(df.iloc[1][9])
    # print(df.iloc[0][1])
    # heatmap = plt.pcolor(df, cmap=plt.cm.bwr)
    # plt.colorbar(heatmap)
    c = chooseCenter(df, 3)
    print(c)
    #print(len(c))
    dist = getDistances(c, df)
    #print(dist)
    y = findClosest(df, dist)
    # q = []
    # for i in range(len(c)):
    #     q.append([])
    # #print(q)
    j = computeNewCenter(df, y)
    #print(j)
    length = df.shape
    oldFrame = pd.DataFrame(np.ndarray((length[0], length[1])))
    oldFrame = oldFrame.fillna(0)
    ct = 0
    while y.equals(oldFrame) == False:
        ct += 1
        oldFrame = y.copy()
        c = computeNewCenter(df, oldFrame)
        #print(c)
        dist = getDistances(c, df)
        #print(dist)
        y = findClosest(df, dist)
        #print(y)
    #plt.pcolor(df, cmap=plt.cm.bwr)
    l = []
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 1:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 2:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 0:
                l.append(df.iloc[i][j])
    l = np.ndarray((length[0], length[1]))
    l = pd.DataFrame(l)
    print(l)
    hm = plt.pcolor(l, cmap=plt.cm.bwr)
    plt.colorbar(hm)
    # print(y)
    # print(c)
    # print(ct)
    #plt.pcolor(y, cmap=plt.cm.bwr)

if __name__ == '__main__':
    main()
My line of thinking was this:
First, randomly choose the centers.
Then create a list of lists holding, for each point, its distance to each center.
Find, for each point, the index of the minimum distance.
Create a data frame of the same shape as the data set and fill each cell with the index of the center that point is closest to.
Recompute each center by taking the mean of the points assigned to that center's index.
Repeat this process until the index data frame no longer changes.
Then create a new data frame that places points sharing a center point close together in the frame.
Finally, create the heatmap.
This did not seem to work, though.
Just wondering: am I on the right track or completely off? If I am on the right track, which parts would I need to change in order to fix the issue? If not, could you please point me in the right direction?
(Images: a comparison of the two heatmaps. The first is the one my program generated; the second is the way it is supposed to look.)
I know my problem lies in some part of the k-means clustering algorithm. My guess is that it is either in the reassignment stage, where the points are reassigned to the centroids and the new centroids are calculated, or in the stopping condition, in that the algorithm does not run long enough. In the back of my head, something also tells me that I am not doing this as efficiently as I could be and that I am missing something key. I have watched several videos on k-means clustering and understand it conceptually; I'm just having a hard time implementing it.
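One concrete bug to check first, sketched with the names from main(): after the three grouping loops, l is rebound with l = np.ndarray((length[0], length[1])), which allocates an *uninitialized* array and discards every value just collected, so the heatmap shows whatever was in memory rather than the regrouped data. Assuming the three loops together visit every cell exactly once, reshaping the collected list preserves the values:

# replace `l = np.ndarray((length[0], length[1]))` with:
l = pd.DataFrame(np.array(l).reshape(length[0], length[1]))

Separately, note that chooseCenter and getDistances treat every individual cell as a 1-D point, so this code clusters scalar values; if the target heatmap groups whole rows, the distances would need to be computed between row vectors instead.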

In a loop - match 'i' with an index value from a csv (Python/Networkx)

I'm currently trying to draw some edges in NetworkX. My nodes have two patch properties, position and status, which are used in a colonisation simulation algorithm. I've been trying to scale up my simulation, which has meant turning away from working out Euclidean distances between all my nodes (and also away from code that works!).
I have a csv holding, for each node, the row-number indices of its nearest neighbours; each index corresponds to a row of another csv that holds the 3d coordinates of the nodes. For example, row 0 of the nearest-neighbour csv might read 0, 56, 76, meaning node 0's nearest neighbours are nodes 56 and 76, whose coordinates are on rows 0, 56 and 76 of the coordinate csv.
I then need to draw edges between these nearest-neighbour nodes so that my algorithm can play with the nodes. So I have some pseudo-code:
import networkx as nx
import numpy as np
from sklearn.neighbors import BallTree
import csv
from itertools import izip_longest
import pandas as pd

density = 0.14  # Stellar density per cubic parsec
L = 100
Patches = int(0.056*density*L**3 + 15)
P_init = 0.0001  # Probability that a patch will be occupied at the beginning
Distance = 10

dat = np.random.uniform(low=-1, high=1, size=(Patches, 3)) * L
np.savetxt('nearand1.csv', dat, delimiter=',')
nearand = np.genfromtxt('nearand1.csv', delimiter=',', usecols=np.arange(0, 3))
tree = BallTree(nearand, leaf_size=2)
ind = tree.query_radius(nearand, r=10)
df = pd.DataFrame(ind)
df.to_csv('bobbington4.csv', sep='e', index=False, header=False)

xcoord = nearand[:, 0]
ycoord = nearand[:, 1]
zcoord = nearand[:, 2]

bobbington = np.genfromtxt('bobbington4.csv', delimiter=',', dtype='int')
bobbington0 = bobbington[:, 0]
bobbington1 = bobbington[:, 1]
bobbington2 = bobbington[:, 2]
bobbington3 = bobbington[:, 3]
bobbington4 = bobbington[:, 4]
bobbington5 = bobbington[:, 5]
bobbington6 = bobbington[:, 6]
bobbington7 = bobbington[:, 7]
bobbington8 = bobbington[:, 8]
bobbington9 = bobbington[:, 9]
bobbington10 = bobbington[:, 10]
bobbington11 = bobbington[:, 11]
bobbington12 = bobbington[:, 12]
bobbington13 = bobbington[:, 13]

class patch:
    def __init__(self, status=0, pos=(0, 0, 0)):
        self.status = status
        self.pos = pos
    def __str__(self):
        return str(self.status)

G = nx.Graph()
for i in xrange(Patches):
    Stat = 1 if np.random.uniform() < P_init else 0
    Pos = (xcoord[i], ycoord[i], zcoord[i])
    G.add_node(patch(Stat, Pos))

for i in G.nodes():
    for j in G.nodes():
        if i.pos where i == bobbington0:
            if j.pos where j == bobbington1:
                G.add_edge(i, j)

pos = {}
for n in G.nodes():
    pos[n] = n.pos
occup = [n.status for n in G]
Time = [0]
Occupancy = [np.sum([n.status for n in G])/float(Patches)]
Here bobbington0 is just a column of node indices going from 0 to 7854, and bobbington1 is the first nearest neighbour of each of those nodes. What is the best way to go about this? I'm struggling to find anything on this type of problem, but I'm probably wording things poorly.
Thanks in advance for any help you can give me.
Update: I've got it. Not particularly elegant, but it works.
for i in G.nodes():
    for j in G.nodes():
        diff1 = j.boba[0] - i.bubu
        if diff1 == 0:
            G.add_edge(i, j)
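For what it's worth, a flatter construction is possible without scanning all node pairs: tree.query_radius already returns, for each node index, the indices of its neighbours, so the graph can be keyed by integer ids with the patch objects stored as node attributes. A sketch reusing nearand, ind, Patches, P_init, and the patch class from above (written for Python 3 / NetworkX 2.x, unlike the Python 2 script):

G = nx.Graph()
for i in range(Patches):
    stat = 1 if np.random.uniform() < P_init else 0
    G.add_node(i, patch=patch(stat, tuple(nearand[i])))  # integer node id, patch as attribute
for i, neighbours in enumerate(ind):  # ind[i]: indices of nodes within r=10 of node i
    for j in neighbours:
        if i != j:  # query_radius includes each point as its own neighbour
            G.add_edge(i, j)
pos = {i: G.nodes[i]['patch'].pos for i in G}  # positions for drawing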
