Numpy: Reshaped arrays behaving strangely

I'm trying to reshape a numpy array and then reshape that array again, but I am not able to achieve my desired result. My data starts in shape (n_vertices, n_time, n_dimensions). I then transform it into shape (n_time, n_vertices * n_dimensions):
import numpy as np
X = np.load('dance.npy')
n_vertices, n_time, n_dims = X.shape
X = X.reshape(n_time, n_vertices * n_dims)
By visualizing the data, I can see that the transformation above does not distort the internal values:
import mpl_toolkits.mplot3d.axes3d as p3
from mpl_toolkits.mplot3d.art3d import juggle_axes
import matplotlib.pyplot as plt
from IPython.display import HTML
from matplotlib import animation
import matplotlib
matplotlib.rcParams['animation.embed_limit'] = 2**128
def update_points(time, points, df):
    points._offsets3d = juggle_axes(df[:, time, 0], df[:, time, 1], df[:, time, 2], 'z')

def get_plot(df, lim=1, frames=200, duration=45, time_axis=1, reshape=False):
    if reshape: df = df.reshape(n_vertices, df.shape[time_axis], n_dims)
    fig = plt.figure()
    ax = p3.Axes3D(fig)
    ax.set_xlim(-lim, lim)
    ax.set_ylim(-lim, lim)
    ax.set_zlim(-lim, lim)
    points = ax.scatter(df[:, 0, 0], df[:, 0, 1], df[:, 0, 2], depthshade=False)  # x, y, z vals
    return animation.FuncAnimation(fig, update_points, frames, interval=duration, fargs=(points, df), blit=False).to_jshtml()

HTML(get_plot(X, frames=200, time_axis=0, reshape=True))
This shows the data in motion (the vertices are body parts of a dancer, and the visualization looks like a human body). This is all good. However, when I try to visualize just the first few time slices of the data (taking the first 20 rows and animating 10 frames), the resulting plot does not show the first few frames of the visualization above -- the form is in fact not human shaped:
HTML(get_plot(X[:20], frames=10, time_axis=0, reshape=True))
Can anyone help me understand why this slicing operation does not match the first few time frames of X? Any suggestions or observations would be very helpful.

It turns out that my reshaping operations weren't manipulating my arrays as I thought they were. The root cause is that reshape never reorders elements: it only reinterprets the array's flat (C-order) sequence, so reshaping (n_vertices, n_time, n_dimensions) directly into (n_time, n_vertices * n_dimensions) interleaves vertices and time steps. The time axis has to be moved to the front (e.g. with swapaxes) before reshaping. The following functions reshape my original array X into the flattened form (with two axes) and then back to the unflattened form (with three axes) properly. I added comments and tests to make sure everything was as expected:
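To see the failure mode on a toy array (a minimal sketch, with small shapes standing in for n_vertices=2, n_time=3, n_dims=2):
import numpy as np
a = np.arange(2 * 3 * 2).reshape(2, 3, 2)   # (n_vertices, n_time, n_dims)
bad = a.reshape(3, 2 * 2)                   # reinterprets memory; rows mix vertices and times
good = a.swapaxes(0, 1).reshape(3, 2 * 2)   # move time to axis 0 first, then flatten
print(np.array_equal(bad, good))            # False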
from math import floor
def flatten(df, run_tests=True):
    '''
    df is a numpy array with the following three axes:
        df.shape[0] = the index of a vertex
        df.shape[1] = the index of a time stamp
        df.shape[2] = the index of a dimension (x, y, z)
    So df[1][0][2] is the value for the 1st vertex (0-based) at time 0 in dimension 2 (z).
    To flatten this array means to push the data into the shape:
        flattened.shape[0] = time index
        flattened.shape[1] = vertex_index * 3 + dimension_index
    So flattened[1][3] is the value of dimension 0 (x) of the 1st vertex (0-based) at time 1.
    '''
    if run_tests:
        assert df.shape == X.shape and np.all(df == X)
    # reshape df such that flattened.shape = time, [x0, y0, z0, x1, y1, z1, ..., xn-1, yn-1, zn-1]
    # (swap the vertex and time axes *before* reshaping; note it is df, not the global X, that is reshaped)
    flattened = df.swapaxes(0, 1).reshape((df.shape[1], df.shape[0] * df.shape[2]), order='C')
    if run_tests:  # switch to False to skip tests
        for idx, i in enumerate(df):
            for jdx, j in enumerate(df[idx]):
                for kdx, k in enumerate(df[idx][jdx]):
                    assert flattened[jdx][(idx * df.shape[2]) + kdx] == df[idx][jdx][kdx]
    return flattened
And to unflatten the flattened data:
def unflatten(df, run_tests=True):
    '''
    df is a numpy array with the following two axes:
        df.shape[0] = time index
        df.shape[1] = vertex_index * 3 + dimension_index
    To unflatten this array means to push the data into the shape:
        unflattened.shape[0] = the index of a vertex
        unflattened.shape[1] = the index of a time stamp
        unflattened.shape[2] = the index of a dimension (x, y, z)
    So df[2][4] == unflattened[1][2][1] (column 4 is vertex 4 // 3 = 1, dimension 4 % 3 = 1).
    '''
    if run_tests:
        assert (len(df.shape) == 2) and (df.shape[1] == X.shape[0] * X.shape[2])
    # the vertex and dimension counts are taken from the global X
    unflattened = np.zeros((X.shape[0], df.shape[0], X.shape[2]))
    for idx, i in enumerate(df):
        for jdx, j in enumerate(df[idx]):
            kdx = floor(jdx / 3)  # vertex index
            ldx = jdx % 3         # dimension index
            unflattened[kdx][idx][ldx] = df[idx][jdx]
    if run_tests:  # set to False to skip tests (they only hold when df == flatten(X))
        for idx, i in enumerate(unflattened):
            for jdx, j in enumerate(unflattened[idx]):
                for kdx, k in enumerate(unflattened[idx][jdx]):
                    assert unflattened[idx][jdx][kdx] == X[idx][jdx][kdx]
    return unflattened
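For what it's worth, the looped unflatten can also be written without loops. Here is a minimal vectorized sketch (unflatten_fast is an illustrative name, assuming the same [x0, y0, z0, x1, y1, z1, ...] column layout as above):
def unflatten_fast(df, n_dims=3):
    # undo the flatten: restore (time, vertex, dim), then swap the
    # time and vertex axes back to get (vertex, time, dim)
    n_time, n_cols = df.shape
    return df.reshape(n_time, n_cols // n_dims, n_dims).swapaxes(0, 1)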
Then to visualize:
import mpl_toolkits.mplot3d.axes3d as p3
from mpl_toolkits.mplot3d.art3d import juggle_axes
import matplotlib.pyplot as plt
from IPython.display import HTML
from matplotlib import animation
import matplotlib
# raise matplotlib's embed size limit so long animations can be displayed inline
matplotlib.rcParams['animation.embed_limit'] = 2**128
def update_points(time, points, df):
    points._offsets3d = juggle_axes(df[:, time, 0], df[:, time, 1], df[:, time, 2], 'z')

def get_plot(df, lim=1, frames=200, duration=45):
    if len(df.shape) == 2: df = unflatten(df)
    fig = plt.figure()
    ax = p3.Axes3D(fig)
    ax.set_xlim(-lim, lim)
    ax.set_ylim(-lim, lim)
    ax.set_zlim(-lim, lim)
    points = ax.scatter(df[:, 0, 0], df[:, 0, 1], df[:, 0, 2], depthshade=False)  # x, y, z vals
    return animation.FuncAnimation(fig,
                                   update_points,
                                   frames,
                                   interval=duration,
                                   fargs=(points, df),
                                   blit=False
                                   ).to_jshtml()
This allows me to slice my time axis without problems:
flat = flatten(X)
unflat = unflatten(flat)
HTML(get_plot(unflat, frames=200))
HTML(get_plot(flat[:20], frames=20))
HTML(get_plot(unflat[:,:20,:], frames=20))
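As a quick sanity check (a sketch using the flat and unflat arrays from above), slicing the flattened time axis and then unflattening matches slicing the unflattened array directly:
sliced = unflatten(flat[:20], run_tests=False)
assert np.array_equal(sliced, unflat[:, :20, :])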

Related

Obtaining an isosurface from 3D data and the corresponding indices

I have a 3D numpy array of temperature values on a grid. From this I can compute the gradients using dTdx, dTdy, dTdz = np.gradient(T). Now I'm only interested in the values of the gradients on the isosurface where the temperature is 900. What I want to do is something like (pseudo-codish):
import numpy as np

def regular(x, y, z, q=100, k=175, a=7.1e-5):
    R = np.sqrt(x**2 + y**2 + z**2)
    return q / (2*np.pi*k) * (1/R) * np.exp(-0.5/a*(R+x))

res = 0.05  # grid resolution (example value; res was undefined in the original snippet)
x = np.arange(-1.5, 0.5 + res/2, res) * 1e-3
y = np.arange(-1.0, 1.0 + res/2, res) * 1e-3
z = np.arange(0.0, 0.5 + res/2, res) * 1e-3
Y, X, Z = np.meshgrid(y, x, z)
T = regular(X, Y, Z)
dTdx, dTdy, dTdz = np.gradient(T)
(xind, yind, zind) = <package>.get_contour_indices(X, Y, Z, T, value=900)
x_gradients_at_isosurface = dTdx[xind, yind, zind]
...
I've tried:
import numpy as np
from skimage import measure

contour_data = measure.find_contours(T[:, :, 0], 900)
contour_data = np.int_(np.round(contour_data[0]))
xs, ys = contour_data[:, 0], contour_data[:, 1]
# G is one of the gradient arrays, e.g. dTdx
gradients_of_interest = np.array([G[x, y, 0] for x, y in zip(xs, ys)])
which works fine, but only works for 2D data. I'm looking for the 3D equivalent. I've found the following:
import plotly.graph_objects as go

surf = go.Isosurface(x=X.flatten(), y=Y.flatten(), z=Z.flatten(), value=T.flatten(), isomin=900, isomax=900)
fig = go.Figure(data=surf)
fig.show()
But I'm not interested in plotting it. I want to know the indices where the temperature is T=900 so I can use it on the gradients. Any ideas?
You need skimage.measure.marching_cubes.
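A minimal sketch of that approach, assuming the T and dTdx arrays from the question (in older scikit-image versions the function is named measure.marching_cubes_lewiner):
import numpy as np
from skimage import measure

# verts are fractional (i, j, k) index coordinates on the 900-isosurface
verts, faces, normals, values = measure.marching_cubes(T, level=900)
ind = np.round(verts).astype(int)  # snap to the nearest grid indices
xind, yind, zind = ind[:, 0], ind[:, 1], ind[:, 2]
x_gradients_at_isosurface = dTdx[xind, yind, zind]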

How to call the value corresponding to a coordinate in a 2D mesh (2D bin)?

I have a temperature value at each coordinate. Over a fixed area, I want to lay a square grid (all cells must have the same side length). To this end I use numpy.meshgrid to generate the cells over my entire area. Now my question is: how do I sum up the temperatures of all the rows whose coordinates fall in the kth cell? I am a bit confused: should I use numpy.histogram2d? It gives me the frequencies of the X and Y values; does that mean I have to use a multidimensional histogram?
Many thanks in advance!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### generating an input data frame
df = pd.DataFrame(data=np.random.randint(2000, 6000, (1000000, 3)))
df.columns = ['X', 'Y', 'Temp']
x2 = np.linspace(df['X'].min(), df['X'].max(), 20)
y2 = np.linspace(df['Y'].min(), df['Y'].max(), 20)
xx, yy = np.meshgrid(x2, y2, indexing='ij')
plt.scatter(xx, yy, color="red", marker="x");
#### Or should I use
Hist, xedges, yedges = np.histogram2d(df['X'], df['Y'], bins=(x2, y2))
H = Hist.T
This takes your data set and produces a 20x20 array containing the average temperature of all the points within each grid cell. If there are no temps in a cell, it will produce a NaN:
import numpy as np

data = np.random.randint(2000, 6000, (100000, 3))
# We divide the coordinate space up into 20 bins.
binsize = (6000 - 2000) // 20
bins = np.zeros((20, 20))
counts = np.zeros((20, 20))
for row in data:
    binx = (row[0] - 2000) // binsize
    biny = (row[1] - 2000) // binsize
    bins[biny, binx] += row[2]   # accumulate temperature per cell
    counts[biny, binx] += 1      # count points per cell
print(bins)
print(counts)
print("Averages:")
print(bins / counts)             # 0/0 gives NaN for empty cells
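Since the question asks about numpy.histogram2d: the same averages can be computed without a Python loop by taking a temperature-weighted histogram and dividing by an unweighted one (a vectorized sketch under the same assumptions as above):
import numpy as np

data = np.random.randint(2000, 6000, (100000, 3))
edges = np.linspace(2000, 6000, 21)  # 21 edges -> 20 bins per axis
sums, _, _ = np.histogram2d(data[:, 0], data[:, 1], bins=(edges, edges), weights=data[:, 2])
counts, _, _ = np.histogram2d(data[:, 0], data[:, 1], bins=(edges, edges))
averages = sums / counts  # NaN (with a divide warning) where a cell is empty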

graphing non-linear decision boundary

data can be found here: ex2data2.txt
I'm trying to plot the decision boundary for a non-linear logistic regression like the following image.
The related Matlab function call would be:
contour(u, v, z, [0, 0], 'LineWidth', 2)
I'm not sure what call to plt.contour() I should be using to reproduce this.
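For what it's worth, a plausible matplotlib analogue of that Matlab call (assuming the uu, vv and z built in the code below) draws only the zero level set:
# levels=[0] draws just the z == 0 contour, i.e. the decision boundary;
# z is transposed because it is computed as z[i, j] = f(u[i], v[j])
# while meshgrid's default 'xy' indexing expects f(u[j], v[i])
plt.contour(uu, vv, z.T, levels=[0], linewidths=2)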
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

def mapFeature(X1, X2, df=True):
    """
    X1, X2 : pd.Series, float, or int
        either a single value or a vector of values
    df : boolean
        whether the inputs are vectors (True) or single scalar values (False)
    ----------
    Return: an m-row DataFrame of polynomial feature values
    Calculates each feature and returns its value
    """
    # add a column of ones for the intercept parameter
    out = pd.DataFrame({'1': np.ones(X1.size)})
    # all combinations of polynomial terms up to 6th degree
    for i in range(1, 7):
        for j in range(i + 1):
            value = (X1**(i - j)) * (X2**j)
            col_name = 'X1**{} * X2**{}'.format(i - j, j)
            if df:
                out = out.join(pd.DataFrame({col_name: value}))
            else:
                # a scalar needs an explicit index to become a one-row frame
                out = out.join(pd.DataFrame({col_name: value}, index=[0]))
    return out

if __name__ == '__main__':
    data = pd.read_csv('ex2data2.txt', header=None,
                       names=['Test1', 'Test2', 'Pass'])
    X = data.iloc[:, :2]
    y = data.iloc[:, 2]
    X = mapFeature(X.iloc[:, 0], X.iloc[:, 1])
    clf = LogisticRegression().fit(X, y)
    theta = clf.coef_
    # start and end were undefined in the original; values roughly
    # covering the ex2data2 feature range are assumed here
    start, end = -1.0, 1.5
    u = np.linspace(start, end, 30)
    v = np.linspace(start, end, 30)
    uu, vv = np.meshgrid(u, v)
    z = np.zeros((30, 30))
    for i in range(30):
        for j in range(30):
            # note v[j], not v[i], so the full grid is evaluated
            z[i, j] = mapFeature(u[i], v[j], df=False).values.dot(theta.T).item()
    # transpose z to match meshgrid's 'xy' indexing
    plt.contour(uu, vv, z.T, [0])
    plt.show()

Filtering 1D numpy arrays in Python

Explanation:
I have two numpy arrays, dataX and dataY, and I am trying to filter each array to reduce the noise. The image below shows the actual input data (blue dots) and an example of what I want it to look like (red dots). I do not need the filtered data to be as perfect as in the example, but I do want it to be as straight as possible. I have provided sample data in the code.
What I have tried:
Firstly, you can see that the data isn't 'continuous', so I first divided it into individual 'segments' (4 of them in this example) and then applied a filter to each 'segment'. Someone suggested that I use a Savitzky-Golay filter. The full, runnable code is below:
import scipy as sc
import scipy.signal
import numpy as np
import matplotlib.pyplot as plt

# Sample Data
ydata = np.array([1,0,1,2,1,2,1,0,1,1,2,2,0,0,1,0,1,0,1,2,7,6,8,6,8,6,6,8,6,6,8,6,6,7,6,5,5,6,6, 10,11,12,13,12,11,10,10,11,10,12,11,10,10,10,10,12,12,10,10,17,16,15,17,16, 17,16,18,19,18,17,16,16,16,16,16,15,16])
xdata = np.array([1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32,33, 1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32])

# Used a diff array to find where there is a big change in Y.
# If there's a big change in Y, then there must be a change of 'segment'.
diffy = np.diff(ydata)
# Create empty numpy arrays to append values into
filteredX = np.array([])
filteredY = np.array([])
# Chose 3 to be the value indicating the change in Y
index = np.where(diffy > 3)
# Loop through the segments
start = 0
for i in range(0, index[0].size + 1):
    # Check if the last segment is reached
    if i == index[0].size:
        print(xdata[start:])
        partSize = xdata[start:].size
        # Window length must be an odd integer
        if partSize % 2 == 0:
            partSize = partSize - 1
        filteredDataX = sc.signal.savgol_filter(xdata[start:], partSize, 3)
        filteredDataY = sc.signal.savgol_filter(ydata[start:], partSize, 3)
        filteredX = np.append(filteredX, filteredDataX)
        filteredY = np.append(filteredY, filteredDataY)
    else:
        print(xdata[start:index[0][i]])
        partSize = xdata[start:index[0][i]].size
        if partSize % 2 == 0:
            partSize = partSize - 1
        filteredDataX = sc.signal.savgol_filter(xdata[start:index[0][i]], partSize, 3)
        filteredDataY = sc.signal.savgol_filter(ydata[start:index[0][i]], partSize, 3)
        start = index[0][i]
        filteredX = np.append(filteredX, filteredDataX)
        filteredY = np.append(filteredY, filteredDataY)

# Plots
plt.plot(xdata, ydata, 'bo', label='Input Data')
plt.plot(filteredX, filteredY, 'ro', label='Filtered Data')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Result')
plt.legend()
plt.show()
This is my result:
When each point is connected, the result looks as follows.
I have played around with the order, but it seems like a third order gave the best result.
I have also tried these filters, among a few others:
scipy.signal.medfilt
scipy.ndimage.filters.uniform_filter1d
But so far none of the filters I have tried were close to what I really wanted. What is the best way to filter data such as this? Looking forward to your help.
One way to get something looking close to your ideal would be clustering + linear regression.
Note that you have to provide the number of clusters and I also cheated a bit in scaling up y before clustering.
import numpy as np
from scipy import cluster, stats

ydata = np.array([1,0,1,2,1,2,1,0,1,1,2,2,0,0,1,0,1,0,1,2,7,6,8,6,8,6,6,8,6,6,8,6,6,7,6,5,5,6,6, 10,11,12,13,12,11,10,10,11,10,12,11,10,10,10,10,12,12,10,10,17,16,15,17,16, 17,16,18,19,18,17,16,16,16,16,16,15,16])
xdata = np.array([1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32,33, 1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32])

def split_to_lines(x, y, k):
    yo = np.empty_like(y, dtype=float)
    # get the cluster centers and the labels for each point
    centers, map_ = cluster.vq.kmeans2(np.array((x, y * 2)).T.astype(float), k)
    # for each cluster, use the labels to select the points belonging to
    # the cluster and do a linear regression
    for i in range(k):
        slope, interc, *_ = stats.linregress(x[map_ == i], y[map_ == i])
        # use the regression parameters to construct y values on the
        # best fit line
        yo[map_ == i] = x[map_ == i] * slope + interc
    return yo

import pylab
pylab.plot(xdata, ydata, 'or')
pylab.plot(xdata, split_to_lines(xdata, ydata, 4), 'ob')
pylab.show()

Shifting data in 2d array through shifted indices

I need to shift a 2D array field, i.e. I have a "previous_data" array which I access through shifted indices to create my "new_data" array.
I can do this in a nonpythonic (and slow) loop, but would very much appreciate some help in finding a pythonic (and faster) solution!
Any help and hints are very much appreciated!
import numpy as np
import matplotlib.pyplot as plt

def nonpythonic():
    #this works, but is slow (for large arrays)
    new_data = np.zeros((ny, nx))
    for j in range(ny):
        for i in range(nx):
            #go through each item, check if it is within the bounds
            #and assign the data to the new_data array
            i_new = ix[j, i]
            j_new = iy[j, i]
            if ((i_new >= 0) and (i_new < nx) and (j_new >= 0) and (j_new < ny)):
                new_data[j, i] = previous_data[j_new, i_new]
    ef, axar = plt.subplots(1, 2)
    im = axar[0].pcolor(previous_data, vmin=0, vmax=2)
    ef.colorbar(im, ax=axar[0], shrink=0.9)
    im = axar[1].pcolor(new_data, vmin=0, vmax=2)
    ef.colorbar(im, ax=axar[1], shrink=0.9)
    plt.show()

def pythonic():
    #tried a few things here, but none are working
    #-tried assigning NaNs to indices (ix,iy) which are out of bounds, but NaNs don't work as indices
    #-tried masked arrays, but they also don't work as indices
    #-tried boolean arrays, but ended in shape mismatches
    #just as in the nonworking code below
    ind_y_good = np.where(iy >= 0) and np.where(iy < ny)
    ind_x_good = np.where(ix >= 0) and np.where(ix < nx)
    new_data = np.zeros((ny, nx))
    new_data[ind_y_good, ind_x_good] = previous_data[iy[ind_y_good], ix[ind_x_good]]

#some 2D array:
nx = 20
ny = 30
#array indices:
iy, ix = np.indices((ny, nx))
#modify indices (shift):
iy = iy + 1
ix = ix - 4
#create some out of range indices (which might happen in my real scenario)
iy[0, 2:7] = -9999
ix[0:3, -1] = 6666
#some previous data which is the basis for the new_data:
previous_data = np.ones((ny, nx))
previous_data[2:8, 10:20] = 2
nonpythonic()
pythonic()
This is the result of the working (nonpythonic) code above:
I implemented a version of pythonic that replicates nonpythonic with some masking and index fiddling - see below. By the way I think the "new" indices should be the ones corresponding to the new array, rather than the old ones, but I've left it as in your existing function.
The main thing to realise is that in your attempt in the question, your conditions
ind_y_good = np.where(iy>=0) and np.where(iy<ny)
ind_x_good = np.where(ix>=0) and np.where(ix<nx)
must be combined, since we must always have pairs of x and y indices. i.e. if the x index is invalid, then so is the y.
Finally, if the indices are really all shifted by a constant factor, you can make this even simpler by using NumPy's roll function and taking a slice of the indices corresponding to the valid area.
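For example, a rough sketch of that roll-based shortcut for the constant part of the shift used below (iy = j + 1, ix = i - 4), blanking the edge rows/columns that the bounds check would have left at zero:
import numpy as np

# new_data[j, i] = previous_data[j + 1, i - 4], so roll by -1 along
# axis 0 and by +4 along axis 1, then zero out the wrapped-around edges
shifted = np.roll(np.roll(previous_data, -1, axis=0), 4, axis=1)
shifted[-1, :] = 0   # j + 1 fell off the bottom edge
shifted[:, :4] = 0   # i - 4 fell off the left edge
(This handles only the pure constant shift, not the deliberately out-of-range indices set in the example.)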
import numpy as np
import matplotlib.pyplot as plt

def nonpythonic(previous_data, ix, iy, nx, ny):
    #this works, but is slow (for large arrays)
    new_data = np.zeros((ny, nx))
    for j in range(ny):
        for i in range(nx):
            #go through each item, check if it is within the bounds
            #and assign the data to the new_data array
            i_new = ix[j, i]
            j_new = iy[j, i]
            if ((i_new >= 0) and (i_new < nx) and (j_new >= 0) and (j_new < ny)):
                new_data[j, i] = previous_data[j_new, i_new]
    return new_data

def pythonic(previous_data, ix, iy):
    ny, nx = previous_data.shape
    iy_old, ix_old = np.indices(previous_data.shape)
    # note you must apply the same condition to both
    # index arrays
    valid = (iy >= 0) & (iy < ny) & (ix >= 0) & (ix < nx)
    new_data = np.zeros((ny, nx))
    new_data[iy_old[valid], ix_old[valid]] = previous_data[iy[valid], ix[valid]]
    return new_data

def main():
    #some 2D array:
    nx = 20
    ny = 30
    #array indices:
    iy, ix = np.indices((ny, nx))
    #modify indices (shift):
    iy = iy + 1
    ix = ix - 4
    #create some out of range indices (which might happen in my real scenario)
    iy[0, 2:7] = -9999
    ix[0:3, -1] = 6666
    #some previous data which is the basis for the new_data:
    previous_data = np.ones((ny, nx))
    previous_data[2:8, 10:20] = 2
    data_nonpythonic = nonpythonic(previous_data, ix, iy, nx, ny)
    data_pythonic = pythonic(previous_data, ix, iy)
    new_data = data_nonpythonic
    ef, axar = plt.subplots(1, 2)
    im = axar[0].pcolor(previous_data, vmin=0, vmax=2)
    ef.colorbar(im, ax=axar[0], shrink=0.9)
    im = axar[1].pcolor(new_data, vmin=0, vmax=2)
    ef.colorbar(im, ax=axar[1], shrink=0.9)
    plt.show()
    print(np.allclose(data_nonpythonic, data_pythonic))

if __name__ == "__main__":
    main()
