How to Maintain Segmented Cell Numbering On Moving Tissue - python

The overall goal of the project is to identify cells that are undergoing a wave-like event and determine if cell area changes after the event occurs (red arrow). The raw file is a video of live frog epithelial tissue that has been exported frame-by-frame into multiple .tiff files. The image below is just one of the many frames used.
(https://i.stack.imgur.com/iFfYR.jpg)
Using Cellpose we have already been able to segment each frame of the video and determine the centroids. The image below is saved as a .npy file, as indicated in the code.
(https://i.stack.imgur.com/7XP0j.jpg)
We then go on to number each cell and calculate the area across all the frames we used.
(https://i.stack.imgur.com/xX7pY.png) (https://i.stack.imgur.com/JcvPj.png)
As you might be able to tell, this is where we run into our issues. Since we are using live epithelial tissue, there is movement in the original video file. As new cells come into frame throughout the raw video, the numbering changes completely between frames. This makes the compiled data virtually useless, as we cannot tell which cell is which from one frame to the next (see image). Another (less pressing) issue is that the wave event itself is almost always identified as a cell by the Cellpose segmentation, regardless of how strictly the threshold is set.
(https://i.stack.imgur.com/zmvI6.png)
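One possible mitigation for the wave artifact, offered as an untested sketch rather than a known fix: since the wave front tends to be much larger and less convex than a real cell, the labelled masks could be screened by region shape before any measurement. The cut-off values below are illustrative placeholders, not tuned numbers:

import numpy as np
from skimage.measure import regionprops

def filter_masks(masks, min_area=50, max_area=5000, min_solidity=0.8):
    # Drop labelled regions whose area or solidity is implausible for a cell.
    cleaned = masks.copy()
    for region in regionprops(masks):
        if not (min_area <= region.area <= max_area
                and region.solidity >= min_solidity):
            cleaned[cleaned == region.label] = 0  # erase the suspect region
    return cleaned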
import math
import os
from os.path import exists

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skimage.measure import regionprops_table


def graphData(filepath, oneImage, maxDist, minDist, savePath, length):
    # graph data for all files in a folder
    if oneImage == False:
        for file in os.listdir(filepath):
            filename = os.fsdecode(file)
            temp = filepath + filename
            if filename.endswith(".npy"):
                a = len(filename) - 7
                save = savePath + filename[:a] + ".tif"
                graphData(temp, True, maxDist, minDist, save, length)
    else:
        # get data from .npy file
        data = np.load(filepath, allow_pickle=True).item()
        masks = data['masks']
        properties = ('label', 'centroid')
        prop_dict = regionprops_table(data['masks'], properties=properties)
        centroidsDF = pd.DataFrame(prop_dict)
        outlines = data['outlines']
        results = []
        n = 0
        # find cell intersections
        for x in range(len(masks) - 2):
            for y in range(len(masks[0]) - 2):
                count = 0
                temp = [0, masks[x][y]]
                if masks[x][y] != n and x != 0 and y != 0:
                    for a in range(5):
                        for b in range(5):
                            if masks[x - 2 + a][y - 2 + b] not in temp:
                                count += 1
                                temp.append(masks[x - 2 + a][y - 2 + b])
                if count >= 2:
                    results.append([x, y, count + 1])
        # remove intersections near the edge of the image
        results = [a for a in results if a[0] >= 20 and a[1] >= 20]
        # remove cell intersections that are too close (based on minDist)
        for p in range(5):
            for a in results:
                nearest = 99999
                for b in results:
                    dist = math.sqrt((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2)
                    if dist <= nearest and dist != 0:
                        nearest = dist
                for b in results:
                    dist = math.sqrt((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2)
                    if dist <= minDist and dist == nearest:
                        results.remove(b)
        # combine cell intersections that are too close (based on maxDist)
        for p in range(5):
            for a in results:
                nearest = 99999
                for b in results:
                    dist = math.sqrt((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2)
                    if dist <= nearest and dist != 0:
                        nearest = dist
                for b in results:
                    dist = math.sqrt((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2)
                    if dist == nearest and dist <= maxDist:
                        avgX = (a[0] + b[0]) / 2
                        avgY = (a[1] + b[1]) / 2
                        Tcount = a[2]
                        if dist >= minDist:
                            Tcount = round((a[2] + b[2] + 2) / 2)
                        for i in range(len(results) - 1):
                            if results[i] == a:
                                results[i] = [avgX, avgY, Tcount]
                        results.remove(b)
        plt.clf()
        # graph masks and outlines
        for i in range(len(np.unique(masks))):
            if i != 0:
                plt.scatter(np.where(masks == i)[0], np.where(masks == i)[1], s=0.0001, c='0.9')
        for i in range(len(np.unique(masks))):
            if i != 0:
                plt.scatter(np.where(outlines == i)[0], np.where(outlines == i)[1], s=0.001, c='0.4')
        xs = [x[0] for x in results]
        ys = [x[1] for x in results]
        zs = [x[2] for x in results]
        plt.scatter(xs, ys, s=.05, c='k')
        # graph number of intersecting cells
        # for a in range(len(xs)):
        #     plt.text(xs[a] + 1, ys[a] + 1, zs[a], size=3, c='k')
        save = savePath + os.path.basename(filepath)[:len(os.path.basename(filepath)) - 3] + "txt"
        # per-cell areas as pixel counts
        areas = []
        for x in range(len(np.unique(data['masks'])) - 1):
            count = 0
            for a in range(len(masks)):
                for b in range(len(masks[0])):
                    if masks[a][b] == x:
                        count += 1
            areas.append([x, count])
        areas.pop(0)
        count = 0
        for a in areas:
            count += a[1]
        count /= (len(np.unique(data['masks'])) - 1)
        if exists(save):
            os.remove(save)
        with open(save, 'w') as f:
            f.write("average cell area: " + str(round(count / length, 5)))
            f.write("\n\nnumber of cells: " + str(len(np.unique(data['masks'])) - 1))
            f.write("\n\narea of each cell (cell # and area): ")
            for a in areas:
                f.write("\n" + str(a[0]) + ", " + str(a[1] / length))
        for a in range(len(centroidsDF)):
            plt.text(centroidsDF['centroid-0'][a], centroidsDF['centroid-1'][a], centroidsDF['label'][a], size=3)
        plt.savefig(savePath, dpi=1440)
Is there a more efficient approach that will help complete our project, or does the movement of the tissue make it too difficult?
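One way to keep the numbering consistent despite the movement, assuming cells drift only a few pixels between consecutive frames, is to link each frame's centroids to the previous frame's centroids by distance. The sketch below uses scipy's Hungarian solver; link_frames and the max_move threshold are illustrative assumptions, not part of the pipeline above:

import numpy as np
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist

def link_frames(prev_centroids, prev_ids, new_centroids, next_id, max_move=15):
    # prev_centroids: (N, 2), new_centroids: (M, 2) arrays of (row, col).
    cost = cdist(prev_centroids, new_centroids)   # pairwise distances
    rows, cols = linear_sum_assignment(cost)      # optimal one-to-one matching
    new_ids = np.full(len(new_centroids), -1, dtype=int)
    for r, c in zip(rows, cols):
        if cost[r, c] <= max_move:                # accept only close matches
            new_ids[c] = prev_ids[r]
    for c in np.where(new_ids == -1)[0]:          # cells that entered the frame
        new_ids[c] = next_id
        next_id += 1
    return new_ids, next_id

Seeded with the first frame's Cellpose labels and run over consecutive frames, this would let the area table be keyed by a stable track ID rather than the per-frame mask number, so the same cell keeps one number for its whole trajectory.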

Related

Eliminating Certain Values in Dataframe

Initial Data
import numpy as np
import pandas as pd

d = {'RedVal': [1, 1.1, 2, 1.5, 1.7, 2, 1, 1.1, 2, 1, 1.1, 2, 2.6, 2.5, 2.4, 2.5],
     'GreenVal': [1, 1.1, 1.1, 1, 1.1, 1.7, 1, 1.1, 1.5, 1, 1.9, 3, 2.8, 2.7, 2.6, 2.5],
     'Frame': [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3],
     'Particle': [0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4]}
testframe = pd.DataFrame(data=d)
testframe

framenot = 2  # set how many frames you would like to get the initial ratio for
ratarray = []  # initialize blank ratio array
testframe = testframe.sort_values(by=['Particle', 'Frame'])

for particle in range(0, 5):
    # skip particle ids that do not occur in the data (e.g. 1 here)
    if not (testframe['Particle'] == particle).any():
        continue
    newframe = testframe.loc[(testframe['Frame'] <= framenot) & (testframe['Particle'] == particle)]
    for i in range(framenot):
        GVal = newframe['GreenVal'].values[i]
        RVal = newframe['RedVal'].values[i]
        ratarray.append(RVal / GVal)

ratarray = np.array(ratarray)
avgRatios = np.average(ratarray.reshape(-1, framenot), axis=1)
stdRatios = np.std(ratarray.reshape(-1, framenot), axis=1)
print(avgRatios)  # average ratio over the initial frames, one entry per particle
print(stdRatios)
So far I have code that gives the average and standard deviation of each particle's Red/Green ratio over frames 0 and 1. Now I want to compare this average ratio to the ratios in the next x frames and eliminate particles whose subsequent ratios fall outside avg + 2*std. Not quite sure how to do this. Any help is appreciated.
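Here is a hedged sketch of one way to finish this with groupby, assuming "outside" means above avg + 2*std (a symmetric band would just add a lower bound) and reusing the column names from testframe above:

import numpy as np
import pandas as pd

# Baseline per particle: mean and std of the Red/Green ratio over frames 0-1.
testframe['Ratio'] = testframe['RedVal'] / testframe['GreenVal']
baseline = (testframe[testframe['Frame'] <= 1]
            .groupby('Particle')['Ratio']
            .agg(['mean', 'std']))
limits = baseline['mean'] + 2 * baseline['std']

# Flag particles whose ratio in any later frame exceeds their own limit.
later = testframe[testframe['Frame'] > 1].set_index('Particle')
over = later['Ratio'].to_numpy() > limits.reindex(later.index).to_numpy()
bad = later.index[over].unique()

# Keep only the particles that never exceed their limit.
filtered = testframe[~testframe['Particle'].isin(bad)]
print(filtered)

Adjusting the Frame <= 1 and Frame > 1 masks changes how many baseline frames are averaged and how many subsequent frames are checked.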

How to code up an image stitching software for these 'simple' images?

TLDR:
Need help trying to calculate overlap region between 2 graphs.
So I'm trying to stitch these 2 images:
Since I know that the images I will be stitching definitely come from the same image, I feel that I should be able to code this up myself. Using libraries like OpenCV feels a little like overkill for me for this task.
My current idea is that I can simplify this task by doing the following steps for each image:
Load image using PIL
Convert image to black and white (PIL image mode "L")
[Optional: crop images to overlapping region by inspection by eye]
Create vector row_sum, which is a sum of each row
[Optional: log row_sum, to reduce the size of values we're working with]
Plot row_sum.
This reduces the problem from three RGB channels per pixel of a 2D image to one grayscale value per pixel; summing across the rows then reduces each image to a 1D signal.
I used the following code to implement the above:
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image


class Stitcher():
    def combine_2(self, img1, img2):
        # thr1, thr2 = self.get_cropped_bw(img1, 115, img2, 80)
        thr1, thr2 = self.get_cropped_bw(img1, 0, img2, 0)
        row_sum1 = np.log(thr1.sum(1))
        row_sum2 = np.log(thr2.sum(1))
        self.plot_4x4(thr1, thr2, row_sum1, row_sum2)

    def get_cropped_bw(self, img1, img1_keep_from, img2, img2_keep_till):
        im1 = Image.open(img1).convert("L")
        im2 = Image.open(img2).convert("L")
        data1 = (np.array(im1)[img1_keep_from:]
                 if img1_keep_from != 0 else np.array(im1))
        data2 = (np.array(im2)[:img2_keep_till]
                 if img2_keep_till != 0 else np.array(im2))
        return data1, data2

    def plot_4x4(self, thr1, thr2, row_sum1, row_sum2):
        fig, ax = plt.subplots(2, 2, sharey="row", constrained_layout=True)
        ax[0, 0].imshow(thr1, cmap="Greys")
        ax[0, 1].imshow(thr2, cmap="Greys")
        ax[1, 0].plot(row_sum1, "k.")
        ax[1, 1].plot(row_sum2, "r.")
        ax[1, 0].set(
            xlabel="Index Value",
            ylabel="Row Sum",
        )
        plt.show()


imgs = (r"combine\imgs\test_image_part_1.jpg",
        r"combine\imgs\test_image_part_2.jpg")
s = Stitcher()
s.combine_2(*imgs)
This gave me this graph:
(I've added in those yellow boxes, to indicate the overlap regions.)
This is the bit I'm stuck at. I want to find exactly:
the index value of the left-side of the yellow box for the 1st image and
the index value of the right-side of the yellow box for the 2nd image.
I define the overlap region as the longest range for which the end of the 1st graph 'matches' the start of the 2nd graph. What should the matching method do if the row-sum values aren't exactly the same (for example, if one is the other scaled by some factor)?
I feel like this is a problem where dot products could measure the similarity between the two graphs, but I can't think of how to implement this.
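One hedged sketch of that idea: normalize each candidate window to zero mean and unit variance (which cancels any constant scale factor) and score every possible overlap length with a mean dot product. find_overlap and min_len are illustrative names, not an established API:

import numpy as np

def find_overlap(row_sum1, row_sum2, min_len=10):
    # Best match of the tail of signal 1 against the head of signal 2,
    # scored by normalized correlation (a scaled dot product).
    best_len, best_score = 0, -np.inf
    max_len = min(len(row_sum1), len(row_sum2))
    for n in range(min_len, max_len + 1):
        a = row_sum1[-n:].astype(float)          # end of the first graph
        b = row_sum2[:n].astype(float)           # start of the second graph
        a = (a - a.mean()) / (a.std() + 1e-12)   # normalization removes scaling
        b = (b - b.mean()) / (b.std() + 1e-12)
        score = np.dot(a, b) / n                 # mean normalized dot product
        if score > best_score:
            best_score, best_len = score, n
    return best_len, best_score

With overlap_len, _ = find_overlap(row_sum1, row_sum2), the left side of the yellow box in the first image would be len(row_sum1) - overlap_len and the right side in the second image would be overlap_len.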
I had a lot more fun with this than I expected. I wrote this using opencv, but that's just to load and show the image. Everything else is done with numpy so swapping this to PIL shouldn't be too difficult.
I'm using a brute-force matcher. I also wrote a random-start hillclimber that runs in much less time, but I can't guarantee it'll find the correct answer since the gradient space isn't smooth. I won't include it in my code since it's long and janky, but if you really need the time efficiency I can add it back in later.
I added a random crop and some salt and pepper noise to the images to test for robustness.
The brute-force matcher operates on the idea that we don't know which section of the two images overlap, so we need to convolve the smaller image over the larger image from left to right, top to bottom. This means our search space is:
horizontal = small_width + big_width
vertical = small_height + big_height
area = horizontal * vertical
This will grow very quickly with image size. I motivate the algorithm by giving it points for having a larger overlap, but it loses more points for having differences in color for the overlapped area.
Here are some pictures from an execution of this program
import cv2
import numpy as np
import random

# randomly snips edges
def randCrop(image, maxMargin):
    c = [random.randint(0, maxMargin) for a in range(4)]
    return image[c[0]:-c[1], c[2]:-c[3]]

# adds noise to image
def saltPepper(image, minNoise, maxNoise):
    h, w = image.shape
    randNum = random.randint(minNoise, maxNoise)
    for a in range(randNum):
        x = random.randint(0, w - 1)
        y = random.randint(0, h - 1)
        image[y, x] = random.randint(0, 255)
    return image

# evaluate layout
def getScore(one, two):
    # do raw subtraction
    left = one - two
    right = two - one
    sub = np.minimum(left, right)
    return np.count_nonzero(sub)

# return 2d random position within range
def randPos(img, big_shape):
    th, tw = big_shape
    h, w = img.shape
    x = random.randint(0, tw - w)
    y = random.randint(0, th - h)
    return [x, y]

# overlays small image onto big image
def overlay(small, big, pos):
    # unpack
    h, w = small.shape
    x, y = pos
    # copy and place
    copy = big.copy()
    copy[y:y+h, x:x+w] = small
    return copy

# calculates overlap region
def overlap(one, two, pos_one, pos_two):
    # unpack
    h1, w1 = one.shape
    h2, w2 = two.shape
    x1, y1 = pos_one
    x2, y2 = pos_two
    # set edges
    l1 = x1
    l2 = x2
    r1 = x1 + w1
    r2 = x2 + w2
    t1 = y1
    t2 = y2
    b1 = y1 + h1
    b2 = y2 + h2
    # go
    left = max(l1, l2)
    right = min(r1, r2)
    top = max(t1, t2)
    bottom = min(b1, b2)
    return [left, right, top, bottom]

# wrapper for overlay + getScore
def fullScore(one, two, pos_one, pos_two, big_empty):
    # check positions
    x, y = pos_two
    h, w = two.shape
    th, tw = big_empty.shape
    if y+h > th or x+w > tw or x < 0 or y < 0:
        return -99999999
    # overlay
    temp_one = overlay(one, big_empty, pos_one)
    temp_two = overlay(two, big_empty, pos_two)
    # get overlap
    l, r, t, b = overlap(one, two, pos_one, pos_two)
    temp_one = temp_one[t:b, l:r]
    temp_two = temp_two[t:b, l:r]
    # score
    diff = getScore(temp_one, temp_two)
    score = (r-l) * (b-t)
    score -= diff * 2
    return score

# do brute force
def bruteForce(one, two):
    # calculate search space
    # unpack size
    h, w = one.shape
    one_size = h * w
    h, w = two.shape
    two_size = h * w
    # small and big
    if one_size < two_size:
        small = one
        big = two
    else:
        small = two
        big = one
    # unpack size
    sh, sw = small.shape
    bh, bw = big.shape
    total_width = bw + sw * 2
    total_height = bh + sh * 2
    # set up empty images
    empty = np.zeros((total_height, total_width), np.uint8)
    # set global best
    best_score = -999999
    best_pos = None
    # start scrolling
    ybound = total_height - sh
    xbound = total_width - sw
    for y in range(ybound):
        print("y: " + str(y) + " || " + str(empty.shape))
        for x in range(xbound):
            # get score
            score = fullScore(big, small, [sw, sh], [x, y], empty)
            # show
            # prog = overlay(big, empty, [sw, sh])
            # prog = overlay(small, prog, [x, y])
            # cv2.imshow("prog", prog)
            # cv2.waitKey(1)
            # compare
            if score > best_score:
                best_score = score
                best_pos = [x, y]
                print("best_score: " + str(best_score))
    return best_pos, [sw, sh], small, big, empty

# do a step of hill climber
def hillStep(one, two, best_pos, big_empty, step):
    # make a step
    new_pos = best_pos[1][:]
    new_pos[0] += step[0]
    new_pos[1] += step[1]
    # get score
    return fullScore(one, two, best_pos[0], new_pos, big_empty), new_pos

# hunt around for good position
# let's do a random-start hillclimber
def randHill(one, two, shape):
    # set up empty images
    big_empty = np.zeros(shape, np.uint8)
    # set global best
    g_best_score = -999999
    g_best_pos = None
    # lets do 200 iterations
    iters = 200
    for a in range(iters):
        # progress check
        print(str(a) + " of " + str(iters))
        # start with random position
        h, w = two.shape[:2]
        pos_one = [w, h]
        pos_two = randPos(two, shape)
        # get score
        best_score = fullScore(one, two, pos_one, pos_two, big_empty)
        best_pos = [pos_one, pos_two]
        # hill climb (only on second image)
        while True:
            # end condition: no step improves score
            end_flag = True
            # 8-way
            for y in range(-1, 1+1):
                for x in range(-1, 1+1):
                    if x != 0 or y != 0:
                        # get score and update
                        score, new_pos = hillStep(one, two, best_pos, big_empty, [x, y])
                        if score > best_score:
                            best_score = score
                            best_pos[1] = new_pos[:]
                            end_flag = False
            # end
            if end_flag:
                break
            else:
                # show
                # prog = overlay(one, big_empty, best_pos[0])
                # prog = overlay(two, prog, best_pos[1])
                # cv2.imshow("prog", prog)
                # cv2.waitKey(1)
                pass
        # check for new global best
        if best_score > g_best_score:
            g_best_score = best_score
            g_best_pos = best_pos[:]
            print("top score: " + str(g_best_score))
    return g_best_score, g_best_pos

# load both images
top = cv2.imread("top.jpg")
bottom = cv2.imread("bottom.jpg")
top = cv2.cvtColor(top, cv2.COLOR_BGR2GRAY)
bottom = cv2.cvtColor(bottom, cv2.COLOR_BGR2GRAY)

# randomly crop
top = randCrop(top, 20)
bottom = randCrop(bottom, 20)

# randomly add noise
saltPepper(top, 200, 1000)
saltPepper(bottom, 200, 1000)

# set up max image (assume no overlap whatsoever)
tw = 0
th = 0
h, w = top.shape
tw += w
th += h
h, w = bottom.shape
tw += w * 2
th += h * 2

# do random-start hill climb
_, best_pos = randHill(top, bottom, (th, tw))

# show
empty = np.zeros((th, tw), np.uint8)
pos1, pos2 = best_pos
image = overlay(top, empty, pos1)
image = overlay(bottom, image, pos2)

# do brute force
# small_pos, big_pos, small, big, empty = bruteForce(top, bottom)
# image = overlay(big, empty, big_pos)
# image = overlay(small, image, small_pos)

# recolor overlap
h, w = empty.shape
color = np.zeros((h, w, 3), np.uint8)
l, r, t, b = overlap(top, bottom, pos1, pos2)
color[:, :, 0] = image
color[:, :, 1] = image
color[:, :, 2] = image
color[t:b, l:r, 0] += 100

# show images
cv2.imshow("top", top)
cv2.imshow("bottom", bottom)
cv2.imshow("overlayed", image)
cv2.imshow("Color", color)
cv2.waitKey(0)
Edit: I added in the random-start hillclimber

Perceptron learning algorithm: Is PLA impossible to halt at some specific number of update?

I'm currently learning about the perceptron learning algorithm (PLA), and found something weird about my implementation.
First of all, the data is linearly separable, so the algorithm converges every time.
I implemented my PLA by visiting examples in a fixed, pre-determined random cycle, and repeated the experiment a large number of times.
When I plot number of updates vs. frequency, I find that some update counts have frequency 0; for example, the algorithm never halts after exactly 34, 36, 38, or 40 updates. Why is this?
Also, why isn't the plot a single-peaked curve, but rather a three-peaked one?
Train Data
My PLA plot
Here is my implementation:
import numpy as np
import random as rd
import matplotlib.pyplot as plt

# Read data
Data_X = []
Data_Y = []
Train_Data = "hw1_7_train"
f = open(Train_Data, "r")
if f.mode == "r":
    fl = f.readlines()
    for line in fl:
        Data = line.split()
        Data_X.append([1.0] + [float(i) for i in Data[0:4]])
        Data_Y.append(int(Data[4]))
f.close()
Data_X = np.array(Data_X)
Data_Y = np.array(Data_Y)

def GoNext(Pos, Length):
    if Pos < Length - 1:
        Pos += 1
    else:
        Pos = 0
    return Pos

def sign(x):
    return -1 if x <= 0 else 1

def PLA(X, Y, cycle):
    weight = np.array([0.0] * 5)
    Length = len(Y)
    Success = 0
    LastFail = -1
    Current_Id = 0
    UpdateCount = 0
    while not Success:
        Current_Pos = cycle[Current_Id]
        Sign = sign(np.inner(weight, X[Current_Pos]))
        # If the current point is correct, check if we can halt, else continue
        if Sign == Y[Current_Pos]:
            # Went a full round without a mistake with the initial weight (NOT likely to happen)
            if LastFail == -1 and Current_Id == Length - 1:
                Success = 1
                continue
            # Went a full round without a mistake since the last point we modified
            elif Current_Pos == LastFail:
                Success = 1
                continue
            # Otherwise go to the next point
            else:
                Current_Id = GoNext(Current_Id, Length)
                continue
        # If the current point is an error, modify the weight
        else:
            LastFail = Current_Pos
            # Modify the weight
            weight += Y[Current_Pos] * X[Current_Pos]
            UpdateCount += 1
            Current_Id = GoNext(Current_Id, Length)
            continue
    return UpdateCount

TotalIter = 9999
TrackUpdateCount = []
for iter in range(TotalIter):
    Cycle = list(range(len(Data_Y)))
    rd.shuffle(Cycle)
    TrackUpdateCount.append(PLA(Data_X, Data_Y, Cycle))
print("Average Update: ", sum(TrackUpdateCount) / TotalIter)

# Plotting...
UpperBound = max(TrackUpdateCount)
LowerBound = min(TrackUpdateCount)
x_axis = list(range(LowerBound, UpperBound + 1))
y_axis = [0] * (UpperBound - LowerBound + 1)
for i in range(LowerBound, UpperBound + 1):
    y_axis[i - LowerBound] = TrackUpdateCount.count(i)
plt.bar(x_axis, y_axis)
plt.xlabel("Number of updates")
plt.ylabel("Frequency")
plt.show()

K-Means clustering multidimensional data with a heatmap

I have been trying to implement k-means clustering with a heatmap, but have been unsuccessful.
Here is the initial data set:
https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv
And here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random

def truncate(f, n):
    return math.floor(f * 10 ** n) / 10 ** n

def chooseCenter(data, centers):
    length = data.shape
    cent = []
    while len(cent) < centers:
        x = random.randrange(0, length[0])
        y = random.randrange(0, length[1])
        if data.iloc[x][y] not in cent:
            d = truncate(data.iloc[x][y], 2)
            cent.append(d)
    return cent

def distance(val, center):
    return math.sqrt((val - center) ** 2)

def getDistances(centers, data):
    length = data.shape
    dist = []
    for i in range(length[0]):
        for j in range(length[1]):
            y = []
            for k in range(len(centers)):
                val = distance(data.iloc[i][j], centers[k])
                y.append(truncate(val, 3))
            dist.append(y)
    return dist

def findClosest(data, dist):
    # for each element, record the index of its nearest center
    indexes = []
    for i in range(len(dist)):
        pt = min(dist[i])
        idx = dist[i].index(pt)
        indexes.append(idx)
    length = data.shape
    n = np.array(indexes)
    # reshape this data frame into the same shape as the data;
    # keep running findClosest until there is no change
    n = pd.DataFrame(np.reshape(n, (length[0], length[1])))
    return n

def computeNewCenter(data, close):
    d = dict()
    for i in range(len(close)):
        for j in range(len(close[0])):
            d[close.iloc[i][j]] = []
    for i in range(len(data)):
        for j in range(len(data[0])):
            if close.iloc[i][j] in d:
                d[close.iloc[i][j]].append(data.iloc[i][j])
    newCenters = []
    for key, value in d.items():
        m = np.mean(value)
        newCenters.append(truncate(m, 3))
    return newCenters

def main():
    data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv', header=None))
    data = data.T
    df = pd.DataFrame(data[1:], columns=data[0], dtype=float).T
    df = df.iloc[::-1]
    c = chooseCenter(df, 3)
    print(c)
    dist = getDistances(c, df)
    y = findClosest(df, dist)
    j = computeNewCenter(df, y)
    length = df.shape
    oldFrame = pd.DataFrame(np.ndarray((length[0], length[1])))
    oldFrame = oldFrame.fillna(0)
    ct = 0
    # iterate until the assignment data frame stops changing
    while not y.equals(oldFrame):
        ct += 1
        oldFrame = y.copy()
        c = computeNewCenter(df, oldFrame)
        dist = getDistances(c, df)
        y = findClosest(df, dist)
    # regroup the data values so elements of the same cluster are adjacent
    l = []
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 1:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 2:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 0:
                l.append(df.iloc[i][j])
    # reshape the regrouped values back into the grid shape for plotting
    l = pd.DataFrame(np.reshape(np.array(l), (length[0], length[1])))
    print(l)
    hm = plt.pcolor(l, cmap=plt.cm.bwr)
    plt.colorbar(hm)

if __name__ == '__main__':
    main()
My line of thinking was this:
First, randomly choose the centers.
Then create a list of lists holding, for each point, the distance to each center.
Find, for each point, the index of the minimum distance.
Create a data frame the same size as the data set and fill each element with the index of the center that point is closest to.
Recompute the centers by taking the mean of the points that share a center index.
Repeat this process until the index data frame no longer changes.
Create a new data frame, placing points that share a center point close together in the frame.
Then create the heatmap.
This did not seem to work, though.
Just wondering: am I on the right track or am I completely off? If I am on the right track, which parts would I need to change to fix the issue? If not, could you please point me in the right direction?
Here is a comparison of the maps:
Here are the maps
The first one is the map my program generated, while the second is the way it is supposed to look.
I know my problem lies somewhere in the k-means clustering algorithm. My guess is that it is either in the reassignment stage, where the points are reassigned to centroids and the new centroids are calculated, or in the stopping condition, in that the algorithm does not run long enough. Something also tells me that I am not doing this as efficiently as I could and that I am missing something key. I have watched several videos on k-means clustering and understand it conceptually; I'm just having a hard time implementing it.
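For reference, here is a minimal sketch of the same element-wise (one-dimensional) k-means loop in plain numpy, assuming, as the code above does, that every grid cell is clustered by its scalar value. It is not the code above, just a compact implementation to diff against:

import numpy as np

def kmeans_1d(values, k=3, iters=100, seed=0):
    # Cluster scalar values with 1-D k-means: assign, re-center, repeat.
    rng = np.random.default_rng(seed)
    centers = rng.choice(values, size=k, replace=False)
    for _ in range(iters):
        labels = np.argmin(np.abs(values[:, None] - centers[None, :]), axis=1)
        new_centers = np.array([values[labels == i].mean() if np.any(labels == i)
                                else centers[i] for i in range(k)])
        if np.allclose(new_centers, centers):  # converged: centers stable
            break
        centers = new_centers
    return labels, centers

Running labels, _ = kmeans_1d(df.values.flatten()) and reshaping labels to df.shape gives a reference clustering to compare against the loop above, which should narrow down whether the reassignment step or the stopping condition is at fault.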

In a loop - match 'i' with an index value from a csv (Python/Networkx)

I'm currently trying to draw some edges in NetworkX. My nodes have two patch properties, position and status, which are used in a colonisation simulation algorithm. I've been trying to scale up my simulation, which has meant turning away from working out Euclidean distances between my nodes (and also away from code that works!).
I have a csv of the row-number indices of the nearest neighbours of each node, where each index corresponds to a row of another csv holding the 3D co-ordinates of the nodes. For example, row 0 of the nearest-neighbour csv might read 0, 56, 76, meaning node 0 has nearest neighbours at nodes 56 and 76, which correspond to rows 0, 56 and 76 of the co-ordinate csv.
I then need to draw edges between these nearest neighbour nodes for my algorithm to play with the nodes. So I have some pseudo-code:
import networkx as nx
import numpy as np
from sklearn.neighbors import BallTree
import pandas as pd

density = 0.14  # stellar density per cubic parsec
L = 100
Patches = int(0.056 * density * L ** 3 + 15)
P_init = 0.0001  # probability that a patch is occupied at the beginning
Distance = 10

dat = np.random.uniform(low=-1, high=1, size=(Patches, 3)) * L
np.savetxt('nearand1.csv', dat, delimiter=',')
nearand = np.genfromtxt('nearand1.csv', delimiter=',', usecols=np.arange(0, 3))

tree = BallTree(nearand, leaf_size=2)
ind = tree.query_radius(nearand, r=10)
df = pd.DataFrame(ind)
df.to_csv('bobbington4.csv', sep='e', index=False, header=False)

xcoord = nearand[:, 0]
ycoord = nearand[:, 1]
zcoord = nearand[:, 2]

bobbington = np.genfromtxt('bobbington4.csv', delimiter=',', dtype='int')
bobbington0 = bobbington[:, 0]
bobbington1 = bobbington[:, 1]
bobbington2 = bobbington[:, 2]
bobbington3 = bobbington[:, 3]
bobbington4 = bobbington[:, 4]
bobbington5 = bobbington[:, 5]
bobbington6 = bobbington[:, 6]
bobbington7 = bobbington[:, 7]
bobbington8 = bobbington[:, 8]
bobbington9 = bobbington[:, 9]
bobbington10 = bobbington[:, 10]
bobbington11 = bobbington[:, 11]
bobbington12 = bobbington[:, 12]
bobbington13 = bobbington[:, 13]

class patch:
    def __init__(self, status=0, pos=(0, 0, 0)):
        self.status = status
        self.pos = pos

    def __str__(self):
        return str(self.status)

G = nx.Graph()
for i in range(Patches):
    Stat = 1 if np.random.uniform() < P_init else 0
    Pos = (xcoord[i], ycoord[i], zcoord[i])
    G.add_node(patch(Stat, Pos))

# pseudo-code for the part I am stuck on: connect each node to the
# neighbours listed in the bobbington columns
for i in G.nodes():
    for j in G.nodes():
        if i.pos where i == bobbington0:       # not valid Python, intent only
            if j.pos where j == bobbington1:
                G.add_edge(i, j)

pos = {}
for n in G.nodes():
    pos[n] = n.pos
occup = [n.status for n in G]
Time = [0]
Occupancy = [np.sum([n.status for n in G]) / float(Patches)]
Here bobbington0 is just a column of node indices going from 0 -> 7854 and bobbington1 is the first nearest neighbour of each of those nodes. What is the best way to go about this? I'm struggling to find anything on this type of problem, but I'm probably wording things poorly.
Thanks in advance for any help you can give me.
I've got it. Not particularly elegant but it works.
for i in G.nodes():
    for j in G.nodes():
        diff1 = j.boba[0] - i.bubu
        if diff1 == 0:
            G.add_edge(i, j)
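For reference, a hedged sketch of a cleaner pattern: key the nodes by row index so the neighbour table maps straight onto edges, and attach status and position as node attributes. This assumes ind is the query_radius output from the question's code; it is a sketch, not the poster's final code:

G = nx.Graph()
for i in range(Patches):
    G.add_node(i,
               status=1 if np.random.uniform() < P_init else 0,
               pos=tuple(nearand[i]))

for i, neighbours in enumerate(ind):  # ind[i]: indices within r=10 of node i
    for j in neighbours:
        if i != j:                    # query_radius includes the node itself
            G.add_edge(i, j)

occup = [G.nodes[n]['status'] for n in G]

Because the node keys are the same row indices used by the BallTree, no matching between positions and neighbour lists is needed at all.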
