I'm processing joystick data. There are two time series, one for the joystick's X motion and another for its Y motion, and the two have different timestamps. In the end, I want to use matplotlib to plot a parametric 2D graph of the joystick data (where time is implicit, and the X and Y motion make up the points on the graph). Before that, though, I have to "merge" the two time series. For convenience, I'm going to assume that joystick motion is linear between timestamps.
I've written something that works (see below), but it seems needlessly complex. I'm hoping to find a simpler way to accomplish this linear interpolation, if possible.
import numpy as np
import matplotlib.pyplot as plt

# Example data
X = np.array([[0.98092103, 1013],
              [1.01400101, 375],
              [1.0561214, -8484],
              [1.06982589, -17181],
              [1.09453125, -16965]])
Y = np.array([[0.98092103, 534],
              [1.00847602, 1690],
              [1.0392499, -5327],
              [1.06982589, -27921],
              [1.10026598, -28915]])

data = []
# keep track of which index was used last
current_indices = [-1, -1]
# make an ordered list of all timestamps from both data sets, no repeats
all_timestamps = sorted(set(X[:, 0]).union(set(Y[:, 0])))
for ts in all_timestamps:
    # for each dimension (X & Y), the index where the timestamp exists, else None
    ts_indices = tuple(indx[0] if len(indx := np.where(Z[:, 0] == ts)[0]) > 0 else None
                       for Z in (X, Y))
    # out-of-range timesteps are assumed to be zero
    ts_vals = [0, 0]
    for variable_indx, (current_z_indx, Z) in enumerate(zip(ts_indices, (X, Y))):
        last_index_used = current_indices[variable_indx]
        if current_z_indx is not None:
            # if the timestep is present, take the value directly
            current_indices[variable_indx] = current_z_indx
            ts_vals[variable_indx] = Z[current_z_indx, 1]
        elif last_index_used not in (-1, len(Z[:, 0]) - 1):
            # if the timestep is within the range of the data, linearly interpolate
            t0, z0 = Z[last_index_used, :]
            t1, z1 = Z[last_index_used + 1, :]
            ts_vals[variable_indx] = z0 + (z1 - z0) * (ts - t0) / (t1 - t0)
    data.append([ts, *ts_vals])

merged_data = np.array(data)
plt.plot(merged_data[:, 1], merged_data[:, 2])
plt.show()
You are looking for np.interp to simplify the linear interpolation.
Following your example:
import numpy as np
import matplotlib.pyplot as plt

# Example data
X = np.array([[0.98092103, 1013],
              [1.01400101, 375],
              [1.0561214, -8484],
              [1.06982589, -17181],
              [1.09453125, -16965]])
Y = np.array([[0.98092103, 534],
              [1.00847602, 1690],
              [1.0392499, -5327],
              [1.06982589, -27921],
              [1.10026598, -28915]])

# extract all timestamps
all_timestamps = sorted(set(X[:, 0]).union(set(Y[:, 0])))

# linear interpolation
valuesX = np.interp(all_timestamps, X[:, 0], X[:, 1])
valuesY = np.interp(all_timestamps, Y[:, 0], Y[:, 1])

# plotting
plt.plot(valuesX, valuesY)
plt.show()
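One caveat worth knowing (this is a general property of np.interp, not something from your code): outside the range of the sample points, np.interp holds the first/last value rather than falling back to zero the way your original loop does. If you want the zero-fill behavior, np.interp accepts left and right arguments:

# reproduce the original "out-of-range values are zero" convention
valuesX = np.interp(all_timestamps, X[:, 0], X[:, 1], left=0, right=0)
valuesY = np.interp(all_timestamps, Y[:, 0], Y[:, 1], left=0, right=0)

With your example data this matters only for the last timestamp (1.10026598), which lies beyond the end of X: your loop assigns it an X value of 0, while plain np.interp would clamp it to -16965.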
I'm trying to implement polynomial regression with gradient descent. I want to fit the following function:
f(x) = (1/3)x^3 - 2x^2 + 2x + 2
The code I use is:
import numpy as np
import matplotlib.pyplot as plt
import scipy.linalg
from sklearn.preprocessing import PolynomialFeatures

np.random.seed(seed=42)

def create_data():
    x = PolynomialFeatures(degree=5).fit_transform(np.linspace(-10, 10, 100).reshape(100, -1))
    l = lambda x_i: (1/3)*x_i**3 - 2*x_i**2 + 2*x_i + 2
    data = l(x[:, 1])
    noise = np.random.normal(0, 0.1, size=np.shape(data))
    y = data + noise
    y = y.reshape(100, 1)
    return {'x': x, 'y': y}

def plot_function(x, y):
    fig = plt.figure(figsize=(10, 10))
    plt.plot(x[:, 1], [(1/3)*x_i**3 - 2*x_i**2 + 2*x_i + 2 for x_i in x[:, 1]],
             c='lightgreen', linewidth=3, zorder=0)
    plt.scatter(x[:, 1], y)
    plt.show()

def w_update(y, x, batch, w_old, eta):
    derivative = np.sum([(y[i] - np.dot(w_old.T, x[i, :])) * x[i, :]
                         for i in range(np.shape(x)[0])])
    print(derivative)
    return w_old + eta * (1/batch) * derivative

# initialize variables
w = np.random.normal(size=(6, 1))
data = create_data()
x = data['x']
y = data['y']
plot_function(x, y)

# Update w
w_s = []
Error = []
for i in range(500):
    error = (1/2) * np.sum([(y[i] - np.dot(w.T, x[i, :]))**2 for i in range(len(x))])
    Error.append(error)
    w_prime = w_update(y, x, np.shape(x)[0], w, 0.001)
    w = w_prime
    w_s.append(w)

# Plot the predicted function
plt.plot(x[:, 1], np.dot(x, w))
plt.show()

# Plot the error
fig3 = plt.figure()
plt.scatter(range(len(Error[10:])), Error[10:])
plt.show()
But as a result I get something strange that is completely out of bounds... I have also tried altering the number of iterations as well as the learning rate eta, but it did not help. I assume I have made a mistake in the update of w.
I have found the solution. The problem is indeed in the part where I calculate the weights, specifically in:
np.sum([(y[d]-np.dot(w_old.T,x[d,:]))*x[d,:] for d in range(np.shape(x)[0])])
which should be like:
np.sum([-(y[d]-np.dot(w.T.copy(),x[d,:]))*x[d,:].reshape(np.shape(w)) for d in range(len(x))],axis=0)
We have to sum with np.sum(..., axis=0) to get the dimensionality we want: the result must have the same shape as w. The numpy sum documentation says:
The default, axis=None, will sum all of the elements of the input array.
This is not what we want to achieve. Adding axis=0 sums over the first axis of our array, which has shape (100, 7, 1); the 100 elements of shape (7, 1) are summed up, and the resulting array has shape (7, 1), which is exactly what we want.
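A quick toy check of the shapes (my addition, not from the original post) makes the difference visible:

import numpy as np

# stand-in for the 100 per-sample gradient terms, each of shape (7, 1)
terms = np.ones((100, 7, 1))

print(np.sum(terms).shape)          # () -- axis=None collapses everything to a scalar
print(np.sum(terms, axis=0).shape)  # (7, 1) -- matches the shape of w

Implementing this and cleaning up the code yields: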
import numpy as np
import matplotlib.pyplot as plt
import scipy.linalg
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler

np.random.seed(seed=42)

def create_data():
    x = PolynomialFeatures(degree=6).fit_transform(np.linspace(-2, 2, 100).reshape(100, -1))
    x[:, 1:] = MinMaxScaler(feature_range=(-2, 2), copy=False).fit_transform(x[:, 1:])
    l = lambda x_i: np.cos(0.8 * np.pi * x_i)
    data = l(x[:, 1])
    noise = np.random.normal(0, 0.1, size=np.shape(data))
    y = data + noise
    y = y.reshape(100, 1)
    return {'x': x, 'y': y}

def plot_function(x, y, w, Error, w_s):
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(40, 10))
    ax[0].plot(x[:, 1], [np.cos(0.8 * np.pi * x_i) for x_i in x[:, 1]],
               c='lightgreen', linewidth=3, zorder=0)
    ax[0].scatter(x[:, 1], y)
    ax[0].plot(x[:, 1], np.dot(x, w))
    ax[0].set_title('Function')
    ax[1].scatter(range(iterations), Error)
    ax[1].set_title('Error')
    plt.show()

# initialize variables
data = create_data()
x = data['x']
y = data['y']
w = np.random.normal(size=(np.shape(x)[1], 1))
eta = 0.1
iterations = 10000
batch = 10

def stochastic_gradient_descent(x, y, w, eta):
    derivative = -(y - np.dot(w.T, x)) * x.reshape(np.shape(w))
    return eta * derivative

def batch_gradient_descent(x, y, w, eta):
    derivative = np.sum([-(y[d] - np.dot(w.T.copy(), x[d, :])) * x[d, :].reshape(np.shape(w))
                         for d in range(len(x))], axis=0)
    return eta * (1/len(x)) * derivative

def mini_batch_gradient_descent(x, y, w, eta, batch):
    gradient_sum = np.zeros(shape=np.shape(w))
    for b in range(batch):
        choice = np.random.choice(list(range(len(x))))
        gradient_sum += -(y[choice] - np.dot(w.T, x[choice, :])) * x[choice, :].reshape(np.shape(w))
    return eta * (1/batch) * gradient_sum

# Update w
w_s = []
Error = []
for i in range(iterations):
    # Calculate error
    error = (1/2) * np.sum([(y[i] - np.dot(w.T, x[i, :]))**2 for i in range(len(x))])
    Error.append(error)

    # Stochastic Gradient Descent
    """
    for d in range(len(x)):
        w -= stochastic_gradient_descent(x[d, :], y[d], w, eta)
    w_s.append(w.copy())
    """

    # Minibatch Gradient Descent
    """
    w -= mini_batch_gradient_descent(x, y, w, eta, batch)
    """

    # Batch Gradient Descent
    w -= batch_gradient_descent(x, y, w, eta)

# Show predicted weights
print(w_s)

# Plot the predicted function and the Error
plot_function(x, y, w, Error, w_s)
As a result we get a reasonable fit, which can surely be improved by altering eta and the number of iterations, as well as by switching to stochastic or mini-batch gradient descent or to more sophisticated optimization algorithms.
I'm trying to reshape a numpy array [link], then reshape that array again, but I am not able to achieve my desired result. My data starts in shape (n_vertices, n_time, n_dimensions). I then transform it into shape (n_time, n_vertices * n_dimensions):
import numpy as np
X = np.load('dance.npy')
n_vertices, n_time, n_dims = X.shape
X = X.reshape(n_time, n_vertices * n_dims)
By visualizing the data, I can see that the transformation above does not distort the internal values:
import mpl_toolkits.mplot3d.axes3d as p3
from mpl_toolkits.mplot3d.art3d import juggle_axes
import matplotlib.pyplot as plt
from IPython.display import HTML
from matplotlib import animation
import matplotlib

matplotlib.rcParams['animation.embed_limit'] = 2**128

def update_points(time, points, df):
    points._offsets3d = juggle_axes(df[:, time, 0], df[:, time, 1], df[:, time, 2], 'z')

def get_plot(df, lim=1, frames=200, duration=45, time_axis=1, reshape=False):
    if reshape:
        df = df.reshape(n_vertices, df.shape[time_axis], n_dims)
    fig = plt.figure()
    ax = p3.Axes3D(fig)
    ax.set_xlim(-lim, lim)
    ax.set_ylim(-lim, lim)
    ax.set_zlim(-lim, lim)
    points = ax.scatter(df[:, 0, 0], df[:, 0, 1], df[:, 0, 2], depthshade=False)  # x, y, z values at time 0
    return animation.FuncAnimation(fig, update_points, frames, interval=duration,
                                   fargs=(points, df), blit=False).to_jshtml()

HTML(get_plot(X, frames=200, time_axis=0, reshape=True))
This shows the data in motion (the vertices are body parts of a dancer, and the visualization looks like a human body). This is all good. However, when I try to visualize just the first 10 time slices of the data, the resulting plot does not show the first few frames of the visualization above; the form is, in fact, not human-shaped:
HTML(get_plot(X[:20], frames=10, time_axis=0, reshape=True))
Can anyone help me understand why this slicing operation does not match the first few time frames of X? Any suggestions or observations would be very helpful.
It turns out that my reshaping operations weren't manipulating my arrays the way I thought they were. The following functions reshape my original array X into the flattened form (with two axes), then back to the unflattened form (with three axes), properly. I added comments and tests to make sure everything is as expected:
from math import floor

def flatten(df, run_tests=True):
    '''
    df is a numpy array with the following three axes:
        df.shape[0] = the index of a vertex
        df.shape[1] = the index of a time stamp
        df.shape[2] = the index of a dimension (x, y, z)

    So df[1][0][2] is the value for the 1st vertex (0-based) at time 0 in dimension 2 (z).

    To flatten this dataframe will mean to push the data into shape:
        flattened.shape[0] = time index
        flattened.shape[1] = [vertex_index*3] + dimension_vertex

    So flattened[1][3] will be the 3rd dimension of the 1st index (0-based) at time 1.
    '''
    if run_tests:
        assert df.shape == X.shape and np.all(df == X)
    # reshape X such that flattened.shape = time, [x0, y0, z0, x1, y1, z1, ... xn-1, yn-1, zn-1]
    flattened = X.swapaxes(0, 1).reshape((df.shape[1], df.shape[0] * df.shape[2]), order='C')
    if run_tests:  # switch to false to skip tests
        for idx, i in enumerate(df):
            for jdx, j in enumerate(df[idx]):
                for kdx, k in enumerate(df[idx][jdx]):
                    assert flattened[jdx][(idx * df.shape[2]) + kdx] == df[idx][jdx][kdx]
    return flattened
And to unflatten the flattened data:
def unflatten(df, run_tests=True):
    '''
    df is a numpy array with the following two axes:
        df.shape[0] = time index
        df.shape[1] = [vertex_index*3] + dimension_vertex

    To unflatten this dataframe will mean to push the data into shape:
        unflattened.shape[0] = the index of a vertex
        unflattened.shape[1] = the index of a time stamp
        unflattened.shape[2] = the index of a dimension (x, y, z)

    So df[2][4] == unflattened[1][2][0]
    '''
    if run_tests:
        assert (len(df.shape) == 2) and (df.shape[1] == X.shape[0] * X.shape[2])
    unflattened = np.zeros((X.shape[0], df.shape[0], X.shape[2]))
    for idx, i in enumerate(df):
        for jdx, j in enumerate(df[idx]):
            kdx = floor(jdx / 3)
            ldx = jdx % 3
            unflattened[kdx][idx][ldx] = df[idx][jdx]
    if run_tests:  # set to false to skip tests
        for idx, i in enumerate(unflattened):
            for jdx, j in enumerate(unflattened[idx]):
                for kdx, k in enumerate(unflattened[idx][jdx]):
                    assert unflattened[idx][jdx][kdx] == X[idx][jdx][kdx]
    return unflattened
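As an aside (my addition, not part of the original answer): since flatten uses a plain C-order reshape after the axis swap, the whole unflattening loop can also be expressed as a single vectorized inverse, which is much faster for large arrays:

def unflatten_fast(df, n_dims=3):
    # (time, vertices * dims) -> (time, vertices, dims) -> (vertices, time, dims)
    return df.reshape(df.shape[0], -1, n_dims).swapaxes(0, 1)

This should agree element-for-element with unflatten for data produced by flatten.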
Then to visualize:
import mpl_toolkits.mplot3d.axes3d as p3
from mpl_toolkits.mplot3d.art3d import juggle_axes
import matplotlib.pyplot as plt
from IPython.display import HTML
from matplotlib import animation
import matplotlib

# ask matplotlib to plot up to 2^128 frames in animations
matplotlib.rcParams['animation.embed_limit'] = 2**128

def update_points(time, points, df):
    points._offsets3d = juggle_axes(df[:, time, 0], df[:, time, 1], df[:, time, 2], 'z')

def get_plot(df, lim=1, frames=200, duration=45):
    if len(df.shape) == 2:
        df = unflatten(df)
    fig = plt.figure()
    ax = p3.Axes3D(fig)
    ax.set_xlim(-lim, lim)
    ax.set_ylim(-lim, lim)
    ax.set_zlim(-lim, lim)
    points = ax.scatter(df[:, 0, 0], df[:, 0, 1], df[:, 0, 2], depthshade=False)  # x, y, z values at time 0
    return animation.FuncAnimation(fig,
                                   update_points,
                                   frames,
                                   interval=duration,
                                   fargs=(points, df),
                                   blit=False).to_jshtml()
This allows me to slice my time axis without problem:
flat = flatten(X)
unflat = unflatten(flat)
HTML(get_plot(unflat, frames=200))
HTML(get_plot(flat[:20], frames=20))
HTML(get_plot(unflat[:,:20,:], frames=20))
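For anyone wondering why the original X.reshape(n_time, n_vertices * n_dims) scrambled the frames, here is a small demonstration I put together (not from the original post). A bare reshape keeps C order and never moves data between rows, so it mixes time stamps; swapping axes first makes time the leading axis:

import numpy as np

a = np.arange(2 * 3 * 2).reshape(2, 3, 2)  # (vertices=2, time=3, dims=2)

bad = a.reshape(3, 4)                  # rows mix several time stamps of one vertex
good = a.swapaxes(0, 1).reshape(3, 4)  # each row is one time stamp for all vertices

print(bad[0])   # [0 1 2 3] -> vertex 0 at times 0 and 1 (scrambled)
print(good[0])  # [0 1 6 7] -> vertices 0 and 1 at time 0 (correct)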
Explanation:
I have two numpy arrays, dataX and dataY, and I am trying to filter each array to reduce the noise. The image below shows the actual input data (blue dots) and an example of what I want it to look like (red dots). I do not need the filtered data to be as perfect as in the example, but I do want it to be as straight as possible. I have provided sample data in the code.
What I have tried:
Firstly, you can see that the data isn't 'continuous', so I first divided it into individual 'segments' (4 of them in this example) and then applied a filter to each 'segment'. Someone suggested that I use a Savitzky-Golay filter. The full, runnable code is below:
import scipy as sc
import scipy.signal
import numpy as np
import matplotlib.pyplot as plt

# Sample Data
ydata = np.array([1,0,1,2,1,2,1,0,1,1,2,2,0,0,1,0,1,0,1,2,7,6,8,6,8,6,6,8,6,6,8,6,6,7,6,5,5,6,6, 10,11,12,13,12,11,10,10,11,10,12,11,10,10,10,10,12,12,10,10,17,16,15,17,16, 17,16,18,19,18,17,16,16,16,16,16,15,16])
xdata = np.array([1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32,33, 1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32])

# Used a diff array to find where there is a big change in Y.
# If there's a big change in Y, then there must be a change of 'segment'.
diffy = np.diff(ydata)

# Create empty numpy arrays to append values into
filteredX = np.array([])
filteredY = np.array([])

# Chose 3 to be the value indicating the change in Y
index = np.where(diffy > 3)

# Loop through the array
start = 0
for i in range(index[0].size + 1):
    # Check if the last segment is reached
    if i == index[0].size:
        print(xdata[start:])
        partSize = xdata[start:].size
        # Window length must be an odd integer
        if partSize % 2 == 0:
            partSize = partSize - 1
        filteredDataX = sc.signal.savgol_filter(xdata[start:], partSize, 3)
        filteredDataY = sc.signal.savgol_filter(ydata[start:], partSize, 3)
        filteredX = np.append(filteredX, filteredDataX)
        filteredY = np.append(filteredY, filteredDataY)
    else:
        print(xdata[start:index[0][i]])
        partSize = xdata[start:index[0][i]].size
        if partSize % 2 == 0:
            partSize = partSize - 1
        filteredDataX = sc.signal.savgol_filter(xdata[start:index[0][i]], partSize, 3)
        filteredDataY = sc.signal.savgol_filter(ydata[start:index[0][i]], partSize, 3)
        start = index[0][i]
        filteredX = np.append(filteredX, filteredDataX)
        filteredY = np.append(filteredY, filteredDataY)

# Plots
plt.plot(xdata, ydata, 'bo', label='Input Data')
plt.plot(filteredX, filteredY, 'ro', label='Filtered Data')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Result')
plt.legend()
plt.show()
This is my result:
When each point is connected, the result looks as follows.
I have played around with the order of the filter, and a third-order fit seems to give the best result.
I have also tried these filters, among a few others:
scipy.signal.medfilt
scipy.ndimage.filters.uniform_filter1d
But so far, none of the filters I have tried has come close to what I really want. What is the best way to filter data such as this? Looking forward to your help.
One way to get something looking close to your ideal would be clustering + linear regression.
Note that you have to provide the number of clusters, and I also cheated a bit by scaling up y before clustering.
import numpy as np
from scipy import cluster, stats

ydata = np.array([1,0,1,2,1,2,1,0,1,1,2,2,0,0,1,0,1,0,1,2,7,6,8,6,8,6,6,8,6,6,8,6,6,7,6,5,5,6,6, 10,11,12,13,12,11,10,10,11,10,12,11,10,10,10,10,12,12,10,10,17,16,15,17,16, 17,16,18,19,18,17,16,16,16,16,16,15,16])
xdata = np.array([1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32,33, 1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32])

def split_to_lines(x, y, k):
    yo = np.empty_like(y, dtype=float)
    # get the cluster centers and the labels for each point
    centers, map_ = cluster.vq.kmeans2(np.array((x, y * 2)).T.astype(float), k)
    # for each cluster, use the labels to select the points belonging to
    # the cluster and do a linear regression
    for i in range(k):
        slope, interc, *_ = stats.linregress(x[map_ == i], y[map_ == i])
        # use the regression parameters to construct y values on the
        # best-fit line
        yo[map_ == i] = x[map_ == i] * slope + interc
    return yo

import pylab
pylab.plot(xdata, ydata, 'or')
pylab.plot(xdata, split_to_lines(xdata, ydata, 4), 'ob')
pylab.show()
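If you'd rather not hand-tune that y scaling, one option (my suggestion, not part of the original answer) is to normalize each feature to unit variance with scipy's cluster.vq.whiten before clustering:

from scipy import cluster

features = np.array((xdata, ydata)).T.astype(float)
whitened = cluster.vq.whiten(features)          # divide each column by its standard deviation
centers, map_ = cluster.vq.kmeans2(whitened, 4)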
I have clustered my data (12000, 3) using the sklearn Gaussian mixture model algorithm (GMM). I have 3 clusters. Each point of my data represents a molecular structure. I would like to know how I could sample each cluster. I have tried the function:
gmm = GMM(n_components=3).fit(Data)
gmm.sample(n_samples=20)
but it performs a sampling of the whole mixture distribution, whereas I need a sample from each individual component.
Well, this is not that easy, since you need to calculate the eigenvectors of all the covariance matrices. Here is some example code for a problem I studied:
import numpy as np
import itertools
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import mixture

# import some data which can be used for gmm
mix = np.loadtxt("mixture.txt", usecols=(0, 1), unpack=True)
#print(mix.shape)

color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold',
                              'darkorange'])

def plot_results(X, Y_, means, covariances, index, title):
    # function for plotting the gaussians
    splot = plt.subplot(2, 1, 1 + index)
    for i, (mean, covar, color) in enumerate(zip(means, covariances, color_iter)):
        v, w = linalg.eigh(covar)
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        u = w[0] / linalg.norm(w[0])
        # as the DP will not use every component it has access to
        # unless it needs it, we shouldn't plot the redundant
        # components.
        if not np.any(Y_ == i):
            continue
        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
        # plot an ellipse to show the Gaussian component
        angle = np.arctan(u[1] / u[0])
        angle = 180. * angle / np.pi  # convert to degrees
        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(0.5)
        splot.add_artist(ell)
    plt.xlim(-4., 3.)
    plt.ylim(-4., 2.)

gmm = mixture.GaussianMixture(n_components=3, covariance_type='full').fit(mix.T)
print(gmm.predict(mix.T))
plot_results(mix.T, gmm.predict(mix.T), gmm.means_, gmm.covariances_, 0,
             'Gaussian Mixture')
So for my problem the resulting plot looked like this:
Edit: here is the answer to your comment. I would use pandas to do this. Assume X is your feature matrix and y are your labels; then:
import pandas as pd

y_pred = gmm.predict(X)
# wrap everything in pandas objects so they can be concatenated column-wise
df_all_info = pd.concat([pd.DataFrame(X),
                         pd.Series(y, name='y_true'),
                         pd.Series(y_pred, name='y_pred')], axis=1)
In the resulting dataframe you can check all the information you want; you can even select just the samples the algorithm misclassified with:
df_wrong = df_all_info[df_all_info['y_true'] != df_all_info['y_pred']]
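Finally, to address the sampling part of the question directly (my addition; it relies only on the public attributes of a fitted GaussianMixture with covariance_type='full'): each component is just a multivariate normal with mean means_[k] and covariance covariances_[k], so you can draw from one component at a time:

import numpy as np

def sample_component(gmm, k, n_samples=20):
    # draw n_samples points from component k of a fitted GaussianMixture
    return np.random.multivariate_normal(gmm.means_[k],
                                         gmm.covariances_[k],
                                         size=n_samples)

samples_cluster0 = sample_component(gmm, 0)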