Eliminating Certain Values in Dataframe

Eliminating Certain Values in Dataframe - python

Initial Data
d = {'RedVal':[1,1.1,2,1.5,1.7,2,1,1.1,2,1,1.1,2,2.6,2.5,2.4,2.5], 'GreenVal':[1,1.1,1.1,1,1.1,1.7,1,1.1,1.5,1,1.9,3,2.8,2.7,2.6,2.5],'Frame':[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3],'Particle':[0,0,0,0,2,2,2,2,3,3,3,3,4,4,4,4] }
testframe = pd.DataFrame(data=d)
testframe
framenot = 2 #set how many frames you would like to get initial ratio for
ratarray = [] #initialize blank ratio array
testframe.sort_values(by =[ 'Particle', 'Frame'])
for particle in range(0,5):
if(testframe['Particle']== particle).any() == False:
particle = particle + 1
else:
newframe = testframe.loc[(testframe['Frame']<= framenot) & (testframe['Particle'] == particle)]
#print(particle)
for i in range(framenot):
#print(i)
GVal = newframe['GreenVal'].values[i]
RVal = newframe['RedVal'].values[i]
ratio = RVal/GVal
#print(RVal)
#print(GVal)
#print(ratio)
ratarray.append(ratio)
i+=1
#print(ratarray)
particle+=1
ratarray = np.array(ratarray)
avgRatios = np.average(ratarray.reshape(-1,framenot), axis = 1)
stdRatios = np.std(ratarray.reshape(-1,framenot), axis = 1)
print(avgRatios) #array with average ratios over set frames starting from initial particle
print(stdRatios)
So far I have code that gives the avg and standard deviation for each particle's ratio of Red/Green over the frames 0 and 1. Now I want to compare this avg ratio to the ratio for the next x frames and eliminate particles where the subsequent frames ratios falls outside the avg+2stdev. Not quite sure how to do this. Any help is appreciated.

Related

How to Maintain Segmented Cell Numbering On Moving Tissue

The overall goal of the project is to identify cells that are undergoing a wave-like event and determine if cell area changes after the event occurs (red arrow). The raw file is a video of live frog epithelial tissue that has been exported frame-by-frame into multiple .tiff files. The image below is just one of the many frames used.
(https://i.stack.imgur.com/iFfYR.jpg)
Using Cellpose we have already been able to segment each frame of the video and determine the centroids. The image below is saved as .npy file as indicated in the code.
(https://i.stack.imgur.com/7XP0j.jpg)
We then go on to number each cell and calculate the area across all the frames we used.
(https://i.stack.imgur.com/xX7pY.png) (https://i.stack.imgur.com/JcvPj.png)
As you might be able to tell, this is where we run into our issues. Since we are using live epithelial tissue, there is movement in the original video file. As new cells come into frame throughout our raw video, the numbering becomes completely different. This means that the data compiled becomes virtually useless as we cannot tell which cell is the same between different frames (see image). Another (less pressing) issue is that the wave event is almost always identified as a cell by the Cellpose segmentation program regardless of how strict the threshold is set to.
(https://i.stack.imgur.com/zmvI6.png)
def graphData(filepath, oneImage, maxDist, minDist, savePath, length):
# graph data for all files in a folder
if oneImage == False:
for file in os.listdir(filepath):
filename = os.fsdecode(file)
temp = filepath + filename
if filename.endswith(".npy"):
a = len(filename)-7
save = savePath+filename[:a]+".tif"
graphData(temp,True,maxDist,minDist,save,length)
else:
#get data from .npy file
data = np.load(filepath, allow_pickle = True).item()
masks = data['masks']
properties = ('label', 'centroid')
prop_dict = regionprops_table(data['masks'], properties=properties)
centroidsDF = pd.DataFrame(prop_dict)
outlines = data['outlines']
results = []
n = 0
#find cell intersections
for x in range(len(masks)-2):
for y in range(len(masks[0])-2):
count = 0
temp = [0,masks[x][y]]
if masks[x][y] != n and x != 0 and y != 0:
for a in range(5):
for b in range(5):
if masks[x-2+a][y-2+b] not in temp:
count += 1
temp.append(masks[x-2+a][y-2+b])
if count >= 2:
results.append([x,y,count+1])
#remove intersections near the edge of the image
for a in results:
if a[0] < 20 or a[1] < 20:
results.remove(a)
#remove cell intersections that are too close (based on minDist)
for p in range(5):
for a in results:
nearest = 99999
for b in results:
dist = abs(math.sqrt((abs(a[0]-b[0])**2)+(abs(a[1]-b[1])**2)))
if dist <= nearest and dist != 0:
nearest = dist
for b in results:
dist = abs(math.sqrt((abs(a[0]-b[0])**2)+(abs(a[1]-b[1])**2)))
if dist <= minDist and dist == nearest:
results.remove(b)
#combine cell intersections that are too close (based on maxDist)
for p in range(5):
for a in results:
nearest = 99999
for b in results:
dist = abs(math.sqrt((abs(a[0]-b[0])**2)+(abs(a[1]-b[1])**2)))
if dist <= nearest and dist != 0:
nearest = dist
for b in results:
dist = abs(math.sqrt((abs(a[0]-b[0])**2)+(abs(a[1]-b[1])**2)))
if dist == nearest and dist <= maxDist:
avgX = (a[0]+b[0])/2
avgY = (a[1]+b[1])/2
Tcount = a[2]
if dist >= minDist:
Tcount = round((a[2]+b[2]+2)/2)
for i in range(len(results)-1):
if results[i]==a:
results[i]= [avgX, avgY, Tcount]
results.remove(b)
plt.clf()
# graph masks and outlines
for i in range(len(np.unique(masks))):
if i != 0:
plt.scatter(np.where(masks==i)[0],np.where(masks==i)[1],s=0.0001,c='0.9')
for i in range(len(np.unique(masks))):
if i != 0:
plt.scatter(np.where(outlines==i)[0],np.where(outlines==i)[1],s=0.001,c='0.4')
xs = [x[0] for x in results]
ys = [x[1] for x in results]
zs = [x[2] for x in results]
plt.scatter(xs,ys,s=.05,c='k')
#graph number of intersecting cells
#for a in range(len(xs)):
#plt.text(xs[a]+1,ys[a]+1,zs[a],size=3,c='k')
save = savePath+os.path.basename(filepath)[:len(os.path.basename(filepath))-3]+"txt"
areas = []
for x in range(len(np.unique(data['masks']))-1):
count = 0
for a in range(len(masks)):
for b in range(len(masks[0])):
if masks[a][b] == x:
count+=1
areas.append([x,count])
areas.pop(0)
count = 0
for a in areas:
count += a[1]
count /= (len(np.unique(data['masks']))-1)
if(exists(save)):
os.remove(save)
with open(save, 'w') as f:
f.write("average cell area: "+ str(round(count/length,5)))
f.write("\n\nnumber of cells: "+ str(len(np.unique(data['masks']))-1))
f.write("\n\narea of each cell (cell # and area): ")
for a in areas:
f.write("\n"+str(a[0])+", "+str(a[1]/length))
for a in range(len(centroidsDF)):
plt.text(centroidsDF['centroid-0'][a],centroidsDF['centroid-1'][a],centroidsDF['label'][a],size=3)
plt.savefig(savePath, dpi = 1440)
Is there a more efficient code that will help complete our project or does the movement of the tissue make it too difficult?

Waterfall chart python matplotlib

I am having a problem with waterfall. I took this chart from matplotlib site and added my own data frame with 2 simple columns with some integer numbers. My waterfall was produced but without numbers, just empty bars. I am a bit lost and I would appreciate any suggestions.
What I am trying to build is the custom waterfall that takes one dataframe with column names, values, and some values for filters like countries. I haven't found anything like that anywhere so I am trying to build my own.
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt;
from matplotlib.ticker import FuncFormatter;
dataset = pd.read_csv('waterfall_test_data.csv')
#Use python 2.7+ syntax to format currency
def money(x, pos):
'The two args are the value and tick position'
return "${:,.0f}".format(x)
formatter = FuncFormatter(money)
#Data to plot. Do not include a total, it will be calculated
index = dataset['columns']
data = dataset['amount']
#Store data and create a blank series to use for the waterfall
trans = pd.DataFrame(data=data,index=index)
blank = trans.amount.cumsum().shift(1).fillna(0)
#Get the net total number for the final element in the waterfall
total = trans.sum().amount
trans.loc["net"]= total
blank.loc["net"] = total
#The steps graphically show the levels as well as used for label placement
step = blank.reset_index(drop=True).repeat(3).shift(-1)
step[1::3] = np.nan
#When plotting the last element, we want to show the full bar,
#Set the blank to 0
blank.loc["net"] = 0
#Plot and label
my_plot = trans.plot(kind='bar', stacked=True, bottom=blank,legend=None, figsize=(15, 5), title="2014 Sales Waterfall")
my_plot.plot(step.index, step.values,'k')
my_plot.set_xlabel("Transaction Types")
#Format the axis for dollars
my_plot.yaxis.set_major_formatter(formatter)
#Get the y-axis position for the labels
y_height = trans.amount.cumsum().shift(1).fillna(0)
#Get an offset so labels don't sit right on top of the bar
max = trans.max()
neg_offset = max / 25
pos_offset = max / 50
plot_offset = int(max / 15)
#Start label loop
loop = 0
for index, row in trans.iterrows():
# For the last item in the list, we don't want to double count
if row['amount'] == total:
y = y_height[loop]
else:
y = y_height[loop] + row['amount']
# Determine if we want a neg or pos offset
if row['amount'] > 0:
y += pos_offset
else:
y -= neg_offset
my_plot.annotate("{:,.0f}".format(row['amount']),(loop,y),ha="center")
loop+=1
#Scale up the y axis so there is room for the labels
my_plot.set_ylim(0,blank.max()+int(plot_offset))
#Rotate the labels
my_plot.set_xticklabels(trans.index,rotation=0)
my_plot.get_figure().savefig("waterfall.png",dpi=200,bbox_inches='tight')

Bin average as a function of position

I want to efficiently calculate the average of a variable (say temperature) over multiple areas of the plane.
I essentially want to do the following.
import numpy as np
num = 10000
XYT = np.random.uniform(0, 1, (num, 3))
X = np.transpose(XYT)[0]
Y = np.transpose(XYT)[1]
T = np.transpose(XYT)[2]
size = 10
bins = np.empty((size, size))
for i in range(size):
for j in range(size):
if rescaled X,Y in bin[i][j]:
bins[i][j] = mean T

I would use pandas (although im sure you can achieve basically the same with vanilla numpy)
df = pandas.DataFrame({'x':npX,'y':npY,'z':npZ})
# solve quadrants
df['quadrant'] = (df['x']>=0)*2 + (df['y']>=0)*1
# group by and aggregate
mean_per_quadrant = df.groupby(['quadrant'])['temp'].aggregate(['mean'])
you may need to create multiple quadrant cutoffs to get unique groupings
for example (df['x']>=50)*4 + (df['x']>=0)*2 + (df['y']>=0)*1 would add an extra 2 quadrants to our group (one y>=0, and one y<0) (just make sure you use powers of 2)

K-Means clustering multidimensional data with a heatmap

I have been trying to implement k-means clustering with a heatmap, but have been unsuccessful.
Here is the initial data set:
https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv
And here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random
#%matplotlib inline
def truncate(f, n):
return math.floor(f * 10 ** n) / 10 ** n
def chooseCenter(data, centers):
length = data.shape
cent = []
while len(cent) < centers :
x = random.randrange(0,length[0])
y = random.randrange(0,length[1])
if data.iloc[x][y] not in cent:
d = truncate(data.iloc[x][y],2)
cent.append(d)
return cent
def distance(val, center):
return math.sqrt((val- center)**2)
def getDistances(centers, data):
length = data.shape
dist = []
for i in range(length[0]):
for j in range(length[1]):
y = []
for k in range(len(centers)):
val = distance(data.iloc[i][j], centers[k])
y.append(truncate(val,3))
dist.append(y)
return dist
def findClosest(data, dist):
close = data.copy()
length = close.shape
indexes = []
for i in range(len(dist)):
pt = min(dist[i])
idx = dist[i].index(pt)
indexes.append(idx)
#print(indexes)
length = data.shape
n = np.array(indexes)
n = pd.DataFrame(np.reshape(n, (length[0],length[1])))
#reshape this data frame into the same shape as the data
#keep running the find closest until there is no change
#try heatmap on this?
#this should cluster it, but to make sure test it
#might need to do some tweaking to this
return n
# for i in range(length[0]):
# for j in range(length[1]):
# print('dist[i]', dist[j])
# pt = min(dist[j])
# print(pt)
# idx = dist[j].index(pt)
# close.iloc[i][j] = int(idx)
#return close
def computeNewCenter(data, close):
d = dict()
for i in range(len(close)):
for j in range(len(close[0])):
d[close.iloc[i][j]] = []
for i in range(len(data)):
for j in range(len(data[0])):
if close.iloc[i][j] in d:
d[close.iloc[i][j]].append(data.iloc[i][j])
newCenters = []
for key, value in d.items():
m = np.mean(value)
newCenters.append(truncate(m, 3))
return newCenters
# lst = [[] * numcenters]
# for i in range(len(close)):
# for j in range(len(close[0])):
# if close.iloc[i][j]
def main():
data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv', header=None))
data = data.T
#print(data)
df = pd.DataFrame(data[1:], columns=data[0], dtype=float).T
df = df.iloc[::-1]
# print(df)
# print(df.iloc[1][9])
# print(df)
# print(df.iloc[0][1])
# heatmap = plt.pcolor(df, cmap=plt.cm.bwr)
# plt.colorbar(heatmap)
c = chooseCenter(df, 3)
print(c)
#print(len(c))
dist = getDistances(c, df)
#print(dist)
y = findClosest(df, dist)
# q = []
# for i in range(len(c)):
# q.append([])
# #print(q)
j = computeNewCenter(df, y)
#print(j)
length = df.shape
oldFrame = pd.DataFrame(np.ndarray((length[0],length[1])))
oldFrame = oldFrame.fillna(0)
ct=0
while y.equals(oldFrame) == False:
ct+=1
oldFrame = y.copy()
c = computeNewCenter(df, oldFrame)
#print(c)
dist = getDistances(c, df)
#print(dist)
y = findClosest(df, dist)
#print(y)
#plt.pcolor(df, cmap=plt.cm.bwr)
l = []
for i in range(len(y)):
for j in range(len(y[0])):
if y.iloc[i][j] == 1:
l.append(df.iloc[i][j])
for i in range(len(y)):
for j in range(len(y[0])):
if y.iloc[i][j] == 2:
l.append(df.iloc[i][j])
for i in range(len(y)):
for j in range(len(y[0])):
if y.iloc[i][j] == 0:
l.append(df.iloc[i][j])
l = np.ndarray((length[0],length[1]))
l = pd.DataFrame(l)
print(l)
hm = plt.pcolor(l, cmap=plt.cm.bwr)
plt.colorbar(hm)
# print(y)
# print(c)
# print(ct)
#plt.pcolor(y, cmap=plt.cm.bwr)
if __name__ == '__main__':
main()
My line of thinking was this:
My current thought process was to first randomly choose the centers.
Then create a list of lists for each point for the distance to each center.
Find the index of the minimum distance for each point for each center.
Create a data frame of the same size as the data set and fill each index for each element with the index of the center the point is closest to.
Recompute the center by taking the mean of the points with the same center index
Repeat this process multiple times until the index data frame does not change.
Create a new data frame and add the points which have the same center point close together in the frame.
Then create the heatmap.
This did not seem to work though.
Just wondering, am I on the right track or am I completely off, and if I am on the right track which parts would I need to change in order to fix the issue. If not could you please point me on the right track.
Here is a comparison of the maps:
Here are the maps
The first one is the one my program generated while the second is the way it is supposed to look.
I know my problem lies in some part of the k-means clustering algorithm, and my guess is it is either in the reassignment stage where you reassign the points to the centroids and calculate the new centroids or in the stopping condition in that the algorithm does not run long enough. Also in the back of my head, something tells me that I am not doing this as efficiently as I could have and that I am missing something key. I have watched several videos on K-means clustering and understand it conceptually, I'm just having a hard time implementing it.

Moving window averages on unequal dimensions

TL;DR: Is there anyway I can get rid of my second for-loop?
I have a time series of points on a 2D-grid. To get rid of fast fluctuations of their position, I average the coordinates over a window of frames. Now in my case, it's possible for the points to cover a larger distance than usual. I don't want to include frames for a specific point, if it travels farther than the cut_off value.
In the first for-loop, I go over all frames and define the moving window. I then calculate the distances between the current frame and each frame in the moving window. After I grab only those positions from all frames, where both the x and y component did not travel farther than cut_off. Now I want to calculate the mean positions for every point from all these selected frames of the moving window (note: the number of selected frames can be smaller than n_window). This leads me to the second for-loop. Here I iterate over all points and actually grab the positions from the frames, in which the current point did not travel farther than cut_off. From these selected frames I calculate the mean value of the coordinates and use it as the new value for the current frame.
This very last for-loop slows down the whole processing. I can't come up with a better way to accomplish this calculation. Any suggestions?
MWE
Put in comments for clarification.
import numpy as np
# Generate a timeseries with 1000 frames, each
# containing 50 individual points defined by their
# x and y coordinates
n_frames = 1000
n_points = 50
n_coordinates = 2
timeseries = np.random.randint(-100, 100, [n_frames, n_points, n_coordinates])
# Set window size to 20 frames
n_window = 20
# Distance cut off
cut_off = 60
# Set up empty array to hold results
avg_data_store = np.zeros([n_frames, timeseries.shape[1], 2])
# Iterate over all frames
for frame in np.arange(0, n_frames):
# Set the frame according to the window size that we're looking at
t_before = int(frame - (n_window / 2))
t_after = int(frame + (n_window / 2))
# If we're trying to access frames below 0, set the lowest one to 0
if t_before < 0:
t_before = 0
# Trying to access frames that are not in the trajectory, set to last frame
if t_after > n_frames - 1:
t_after = n_frames - 1
# Grab x and y coordinates for all points in the corresponding window
pos_before = timeseries[t_before:frame]
pos_after = timeseries[frame + 1:t_after + 1]
pos_now = timeseries[frame]
# Calculate the distance between the current frame and the windows before/after
d_before = np.abs(pos_before - pos_now)
d_after = np.abs(pos_after - pos_now)
# Grab indices of frames+points, that are below the cut off
arg_before = np.argwhere(np.all(d_before < cut_off, axis=2))
arg_after = np.argwhere(np.all(d_after < cut_off, axis=2))
# Iterate over all points
for i in range(0, timeseries.shape[1]):
# Create temp array
temp_stack = pos_now[i]
# Grab all frames in which the current point did _not_
# travel farther than `cut_off`
all_before = arg_before[arg_before[:, 1] == i][:, 0]
all_after = arg_after[arg_after[:, 1] == i][:, 0]
# Grab the corresponding positions for this points in these frames
all_pos_before = pos_before[all_before, i]
all_pos_after = pos_after[all_after, i]
# If we have any frames for that point before / after
# stack them into the temp array
if all_pos_before.size > 0:
temp_stack = np.vstack([all_pos_before, temp_stack])
if all_pos_after.size > 0:
temp_stack = np.vstack([temp_stack, all_pos_after])
# Calculate the moving window average for the selection of frames
avg_data_store[frame, i] = temp_stack.mean(axis=0)

If you are fine with calculating the cutoff distance in x and y separately, you can use scipy.ndimage.generic_filter.
import numpy as np
from scipy.ndimage import generic_filter
def _mean(x, cutoff):
is_too_different = np.abs(x - x[len(x) / 2]) > cutoff
return np.mean(x[~is_too_different])
def _smooth(x, window_length=5, cutoff=1.):
return generic_filter(x, _mean, size=window_length, mode='nearest', extra_keywords=dict(cutoff=cutoff))
def smooth(arr, window_length=5, cutoff=1., axis=-1):
return np.apply_along_axis(_smooth, axis, arr, window_length=window_length, cutoff=cutoff)
# --------------------------------------------------------------------------------
def _simulate_movement_2d(T, fraction_is_jump=0.01):
# generate random velocities with a few "jumps"
velocity = np.random.randn(T, 2)
is_jump = np.random.rand(T) < fraction_is_jump
jump = 10 * np.random.randn(T, 2)
jump[~is_jump] = 0.
# pre-allocate position and momentum arrays
position = np.zeros((T,2))
momentum = np.zeros((T,2))
# initialise the first position
position[0] = np.random.randn(2)
# update position using velocity vector:
# smooth movement by not applying the velocity directly
# but rather by keeping track of the momentum
for ii in range(2,T):
momentum[ii] = 0.9 * momentum[ii-1] + 0.1 * velocity[ii-1]
position[ii] = position[ii-1] + momentum[ii] + jump[ii]
# add some measurement noise
noise = np.random.randn(T,2)
position += noise
return position
def demo(nframes=1000, npoints=3):
# create data
positions = np.array([_simulate_movement_2d(nframes) for ii in range(npoints)])
# format to (nframes, npoints, 2)
position = positions.transpose([1, 0, 2])
# smooth
smoothed = smooth(positions, window_length=11, cutoff=5., axis=1)
# plot
x, y = positions.T
xs, ys = smoothed.T
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1,1)
ax.plot(x, y, 'o')
ax.plot(xs, ys, 'k-', alpha=0.3, lw=2)
plt.show()
demo()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Eliminating Certain Values in Dataframe - python

Related

How to Maintain Segmented Cell Numbering On Moving Tissue

Waterfall chart python matplotlib

Bin average as a function of position

K-Means clustering multidimensional data with a heatmap

Moving window averages on unequal dimensions

Categories

Resources