I have a NumPy array `I` that stores N images, each with P pixels. Every image is square, so P = q*q.
# Example data: N flattened q-by-q images, one image per row.
N = 1000 # number of images
q = 10 # side length (height and width) of each image, in pixels
P = q*q # number of pixels per image
I = np.ones((N,P)) # N images stored as rows of length P
Now I want to delete patches of size ps around a selected index IDX (set all values to zero).
ps = 2 # patch size (ps x ps)
# One flat pixel index per image, drawn from [0, P); shape (N, 1).
IDX = np.random.randint(0,P,(N,1))
My approach was to reshape every single image using reshape(q,q) and overwrite the pixels around IDX. The problem is that I do not know how to compute the (row, column) position inside the image from the flat index IDX. Additionally, I have to check that the patch does not extend outside the image.
How to tackle this problem and is there any way to vectorize this procedure?
EDIT:
With the help of @Brenlla I did the following to remove patches. The problem with my approach is that it needs three for-loops and I have to reshape every image twice. Is there any way to increase the performance? This part slows down my code significantly.
import numpy as np
import matplotlib.pyplot as plt
def myplot(I, imgs=10):
    """Show the first rows of ``I`` as an ``imgs`` x ``imgs`` grid of images.

    Each row of ``I`` is reshaped to ``(q, q)`` using the module-level ``q``
    and drawn without axes. If ``I`` holds fewer than ``imgs**2`` rows, only
    the available images are drawn instead of raising ``IndexError``.

    Parameters
    ----------
    I : array of shape (N, q*q)
        Flattened images, one per row.
    imgs : int, optional
        Number of grid rows and columns (default 10).
    """
    n_shown = min(imgs**2, len(I))
    for i in range(n_shown):
        plt.subplot(imgs, imgs, i + 1)
        plt.imshow(I[i].reshape(q, q), interpolation="none")
        plt.axis("off")
    plt.show()
N = 10000  # number of images
q = 28  # image side length in pixels
P = q * q  # pixels per image
I = np.random.rand(N, P)
ps = 3  # patch side length
IDX = np.random.randint(0, P, (N, 1))

# Overwrite a ps-by-ps patch in every image, one pixel at a time.
for i in range(N):
    img = I[i].reshape(q, q)
    # y0 receives the first unravelled coordinate and x0 the second; the
    # patch is then addressed as img[x0 + x, y0 + y].
    y0, x0 = np.unravel_index(IDX[i, 0], (q, q))
    for x in range(ps):
        for y in range(ps):
            # Skip pixels that would fall outside the image.
            if x0 + x >= q or y0 + y >= q:
                continue
            img[x0 + x, y0 + y] = 2.0
    I[i] = img.reshape(1, q * q)

myplot(I)
Yes, that can be done, but it involves heavy use of NumPy broadcasting.
Generate data plus a hard copy of I:
import time
# Same problem size as in the question.
N = 10000
q = 28
P = q*q
ps = 3
I = np.random.rand(N,P)
IDX = np.random.randint(0,P,(N,1))
# Independent copy so the loop and the broadcasting version start from the same data.
I_copy = I.copy()
And now run the loop solution. I switched x0 and y0:
# Reference implementation: patch every image with explicit Python loops.
# time.clock() was removed in Python 3.8; perf_counter() is the replacement
# for measuring elapsed time.
t0 = time.perf_counter()
for i in range(N):
    img = I[i].reshape(q, q)
    x0, y0 = np.unravel_index(IDX[i, 0], (q, q))
    for x in range(ps):
        for y in range(ps):
            # Only write pixels that lie inside the image.
            if (x0 + x < q) and (y0 + y < q):
                img[x0 + x, y0 + y] = 2.0
    I[i] = img.reshape(1, q * q)
print('With loop: {:.2f} ms'.format((time.perf_counter() - t0) * 1e3))
Approx. 276 ms on my machine. Now the broadcasting:
# Vectorized version: build all patch coordinates at once and assign in a
# single fancy-indexing operation.
# time.clock() was removed in Python 3.8; use perf_counter() instead.
t0 = time.perf_counter()
x_shift, y_shift = np.meshgrid(range(ps), range(ps))
x, y = np.unravel_index(IDX, (q, q))
# roi = region of interest; shapes are (N, ps, ps).
# Coordinates past the last row/column are clamped to q - 1. A clamped
# coordinate always lands on a pixel that belongs to the same patch, so the
# result matches the bounds check used in the loop version.
roix = np.minimum(x[:, :, None] + x_shift, q - 1)
roiy = np.minimum(y[:, :, None] + y_shift, q - 1)
# The assignment goes through reshape(), so it relies on reshape returning a
# view of I_copy (which it does for this freshly copied, contiguous array).
I_copy.reshape(N, q, q)[np.arange(N)[:, None, None], roix, roiy] = 2.0
print('No loop: {:.2f} ms'.format((time.perf_counter() - t0) * 1e3))
print(np.array_equal(I, I_copy))
Roughly 80x faster
Related
I am trying to make my own CFD solver and one of the most computationally expensive parts is solving for the pressure term. One way to solve Poisson differential equations faster is by using a multigrid method. The basic recursive algorithm for this is:
function phi = V_Cycle(phi,f,h)
    % Recursive V-Cycle Multigrid for solving the Poisson equation (\nabla^2 phi = f) on a uniform grid of spacing h
    %   phi - current approximation of the solution (input and output)
    %   f   - right-hand side of the equation
    %   h   - grid spacing on this level; the next coarser level uses 2*h

    % Pre-Smoothing
    phi = smoothing(phi,f,h);

    % Compute Residual Errors
    r = residual(phi,f,h);

    % Restriction: transfer the residual to the coarser grid
    rhs = restriction(r);

    % The coarse-grid correction starts from a zero initial guess
    eps = zeros(size(rhs));

    % stop recursion at smallest grid size, otherwise continue recursion
    if smallest_grid_size_is_achieved
        eps = smoothing(eps,rhs,2*h);
    else
        eps = V_Cycle(eps,rhs,2*h);
    end

    % Prolongation and Correction: add the interpolated coarse-grid correction
    phi = phi + prolongation(eps);

    % Post-Smoothing
    phi = smoothing(phi,f,h);
end
I've attempted to implement this algorithm myself (also at the end of this question) however it is very slow and doesn't give good results so evidently it is doing something wrong. I've been trying to find why for too long and I think it's just worthwhile seeing if anyone can help me.
If I use a grid size of 2^5 by 2^5 points, then it can solve it and give reasonable results. However, as soon as I go above this it takes exponentially longer to solve and basically get stuck at some level of inaccuracy, no matter how many V-Loops are performed. at 2^7 by 2^7 points, the code takes way too long to be useful.
I think my main issue is that my implementation of the Jacobi iteration uses dense linear algebra to calculate the update at each step. This should, in general, be fast; however, for an n-by-m grid the update matrix A has (n*m) x (n*m) entries, so on a 2^7 by 2^7 grid every matrix-vector product is expensive. As most of the entries are just zeros, should I calculate the result using a different method?
if anyone has any experience in multigrid methods, I would appreciate any advice!
Thanks
my code:
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 29 16:24:16 2020
#author: mclea
"""
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import convolve2d
from mpl_toolkits.mplot3d import Axes3D
from scipy.interpolate import griddata
from matplotlib import cm
def restrict(A):
    """
    Creates a new grid of points which is half the size of the original
    grid in each dimension.

    The outermost row/column on each side is treated as a boundary layer.
    Every interior coarse cell is the mean of the matching 2x2 block of
    interior fine cells; the boundary layer is then filled by set_BC.

    Parameters
    ----------
    A : 2-D array of shape (n, m)

    Returns
    -------
    2-D array of shape (int((n-2)/2 + 2), int((m-2)/2 + 2))
    """
    n = A.shape[0]
    m = A.shape[1]
    new_n = int((n-2)/2+2)
    new_m = int((m-2)/2+2)
    new_array = np.zeros((new_n, new_m))
    # Number of interior coarse cells along each axis.
    rows = max(new_n - 2, 0)
    cols = max(new_m - 2, 0)
    # Group the fine interior into 2x2 blocks and average each block in one
    # vectorized step instead of calling np.average once per coarse cell.
    blocks = A[1:1 + 2*rows, 1:1 + 2*cols].reshape(rows, 2, cols, 2)
    new_array[1:1 + rows, 1:1 + cols] = blocks.mean(axis=(1, 3))
    new_array = set_BC(new_array)
    return new_array
def interpolate_array(A):
    """
    Creates a grid of points which is double the size of the original
    grid in each dimension. Uses linear interpolation between grid points.

    Both grids are mapped onto the unit square, so the first and last
    row/column of the input coincide with those of the output.

    Parameters
    ----------
    A : 2-D array of shape (n, m)

    Returns
    -------
    2-D array of shape ((n-2)*2 + 2, (m-2)*2 + 2)
    """
    n = A.shape[0]
    m = A.shape[1]
    new_n = int((n-2)*2 + 2)
    new_m = int((m-2)*2 + 2)
    # Normalized coordinates of the source points.
    row_idx, col_idx = np.indices(A.shape)
    i = (row_idx/(n-1)).flatten()
    j = (col_idx/(m-1)).flatten()
    new_i = np.linspace(0, 1, new_n)
    new_j = np.linspace(0, 1, new_m)
    # indexing="ij" keeps axis 0 as the row axis, so the result has shape
    # (new_n, new_m) for non-square input as well. With the default "xy"
    # indexing the result came out with shape (new_m, new_n).
    new_ii, new_jj = np.meshgrid(new_i, new_j, indexing="ij")
    return griddata((i, j), A.flatten(), (new_ii, new_jj), method="linear")
def adjacency_matrix(rows, cols):
    """
    Creates the adjacency matrix for an n by m shaped grid

    Grid cell (r, c) has flat index r*cols + c. Entry [a, b] is 1 when
    cells a and b are horizontal or vertical neighbours and 0 otherwise.
    Returns a dense (rows*cols, rows*cols) float array.
    """
    n = rows*cols
    M = np.zeros((n,n))
    idx = np.arange(n).reshape(rows, cols)
    # Two inner diagonals: each cell and its right-hand neighbour
    left, right = idx[:, :-1].ravel(), idx[:, 1:].ravel()
    M[left, right] = 1
    M[right, left] = 1
    # Two outer diagonals: each cell and the cell directly below it
    upper, lower = idx[:-1, :].ravel(), idx[1:, :].ravel()
    M[upper, lower] = 1
    M[lower, upper] = 1
    return M
def create_differences_matrix(rows, cols):
    """
    Creates the central differences matrix A for an n by m shaped grid

    Grid cell (r, c) has flat index r*cols + c. The diagonal is 4 and every
    pair of horizontally or vertically adjacent cells gets -1. Returns a
    dense (rows*cols, rows*cols) float array.
    """
    n = rows*cols
    M = np.zeros((n,n))
    idx = np.arange(n).reshape(rows, cols)
    neighbour_pairs = (
        (idx[:, :-1], idx[:, 1:]),   # two inner diagonals (left/right)
        (idx[:-1, :], idx[1:, :]),   # two outer diagonals (up/down)
    )
    for first, second in neighbour_pairs:
        a, b = first.ravel(), second.ravel()
        M[a, b] = -1
        M[b, a] = -1
    np.fill_diagonal(M, 4)
    return M
def set_BC(A):
    """
    Sets the boundary conditions of the field

    Copies the nearest interior row/column into each boundary row/column.
    Columns are copied first and rows second, so the corners end up with
    the value of their diagonal interior neighbour.

    A is modified in place and also returned.
    """
    # Left and right boundary columns
    A[:, 0] = A[:, 1]
    A[:, -1] = A[:, -2]
    # Top and bottom boundary rows (these include the updated corners)
    A[0, :] = A[1, :]
    A[-1, :] = A[-2, :]
    return A
def create_A(n,m):
    """
    Builds the dense matrices used by the Jacobi update for an n by m grid.

    Returns the tuple (A, LaddU, invD):
    A is the central differences matrix, LaddU the adjacency matrix and
    invD a diagonal matrix with 1/4 on the diagonal. Each one has shape
    (n*m, n*m).
    """
    size = n*m
    differences = create_differences_matrix(n,m)
    off_diagonal = adjacency_matrix(n,m)
    inverse_diagonal = np.eye(size) * (1/4)
    return differences, off_diagonal, inverse_diagonal
def calc_RJ(rows, cols):
    """
    Calculates the Jacobi update matrix Rj for an n by m shaped grid

    Grid cell (r, c) has flat index r*cols + c. Entry [a, b] is 0.25 when
    cells a and b are horizontal or vertical neighbours and 0 otherwise.
    Returns a dense (rows*cols, rows*cols) float array.
    """
    n = int(rows*cols)
    M = np.zeros((n,n))
    idx = np.arange(n).reshape(rows, cols)
    # Flat indices of every horizontally adjacent pair, then of every
    # vertically adjacent pair.
    first = np.concatenate((idx[:, :-1].ravel(), idx[:-1, :].ravel()))
    second = np.concatenate((idx[:, 1:].ravel(), idx[1:, :].ravel()))
    M[first, second] = 0.25
    M[second, first] = 0.25
    return M
def jacobi_update(v, f, nsteps=1, max_err=1e-3):
    """
    Applies Jacobi iterations to the interior of v for nabla(v) = f

    Each sweep replaces every interior value with
    0.25 * (sum of its four neighbours) + 0.25 * f, where neighbours that
    lie on the outermost row/column count as zero (the boundary values of v
    are never read). This is the same update as multiplying by the dense
    Jacobi matrix of the interior grid, but it is evaluated as a stencil so
    no (n*m) x (n*m) matrix has to be built.

    Parameters
    ----------
    v : 2-D array, modified in place (interior only)
    f : 2-D array with the same shape as v
    nsteps : stop after this many sweeps
    max_err : stop once the largest change in one sweep is below this value

    Returns
    -------
    (v, (steps_done, last_max_change))
    """
    rhs = 0.25 * f[1:-1, 1:-1]
    step = 0
    while True:
        v_old = v.copy()
        step += 1
        # Interior values surrounded by a ring of zeros.
        padded = np.zeros(v.shape)
        padded[1:-1, 1:-1] = v_old[1:-1, 1:-1]
        neighbours = (padded[:-2, 1:-1] + padded[2:, 1:-1]
                      + padded[1:-1, :-2] + padded[1:-1, 2:])
        v[1:-1, 1:-1] = 0.25 * neighbours + rhs
        max_change = np.abs(v - v_old).max()
        if step == nsteps or max_change < max_err:
            break
    return v, (step, max_change)
def MGV(f, v):
    """
    Solves for nabla(v) = f using a multigrid method

    Performs one recursive V-cycle: pre-smooth, restrict the residual,
    solve for the correction on the coarser grid, interpolate it back,
    subtract it and post-smooth.

    The pre-smoothing step writes into the array passed as v; the returned
    array is a new one whenever the recursive branch is taken.
    """
    n = v.shape[0]
    m = v.shape[1]

    # On the smallest grid size, approximate the solution with up to 1000
    # Jacobi sweeps.
    if n <= 6 or m <= 6:
        v, info = jacobi_update(v, f, nsteps=1000)
        return v

    # smoothing
    v, info = jacobi_update(v, f, nsteps=10, max_err=1e-1)

    # calculate residual r = A v - f, where A has 4 on the diagonal and -1
    # for each of the four neighbours on the full n x m grid. Evaluated as a
    # stencil on a zero-padded copy, which equals the dense matrix product
    # without allocating an (n*m) x (n*m) matrix.
    padded = np.pad(v, 1, mode="constant")
    neighbours = (padded[:-2, 1:-1] + padded[2:, 1:-1]
                  + padded[1:-1, :-2] + padded[1:-1, 2:])
    r = 4 * v - neighbours - f

    # downsample residual error
    # NOTE(review): the operator carries no grid-spacing factor, so the
    # coarse-grid right-hand side may need rescaling (a factor of 4 when the
    # spacing doubles) - confirm against the intended discretization.
    r = restrict(r)
    zero_array = np.zeros(r.shape)

    # interpolate the correction computed on a coarser grid
    d = interpolate_array(MGV(r, zero_array))

    # Subtract the prolongated coarse-grid correction from the fine grid
    v = v - d

    v, info = jacobi_update(v, f, nsteps=10, max_err=1e-6)
    return v
sigma = 0

# Setting up the grid
k = 6
n = 2**k+2
m = 2**(k)+2

hx = 1/n
hy = 1/m

L = 1
H = 1

x = np.linspace(0, L, n)
y = np.linspace(0, H, m)
XX, YY = np.meshgrid(x, y)

# Setting up the initial conditions
f = np.ones((n,m))
v = np.zeros((n,m))

# How many V cycles to perform
# NOTE(review): err is an absolute tolerance on the change per cycle; a value
# of 1 is very loose - confirm it is intended.
err = 1
n_cycles = 10
cycle = 0

# Perform V cycles until converged or reached the maximum
# number of cycles
while True:
    cycle += 1
    # MGV smooths its argument in place, so hand it a copy. Otherwise v has
    # already been changed by the time it is compared with v_new.
    v_new = MGV(f, v.copy())
    change = np.abs(v - v_new).max()
    v = v_new
    if change < err or cycle == n_cycles:
        break

print("Number of cycles " + str(cycle))
plt.contourf(v)
I realize that I'm not answering your question directly, but I do note that you have quite a few loops that will contribute some overhead cost. When optimizing code, I have found the following thread useful - particularly the line profiler thread. This way you can focus in on "high time cost" lines and then start to ask more specific questions regarding opportunities to optimize.
How do I get time of a Python program's execution?
I have a specific python issue, that desperately needs to be sped up by avoiding the use of a loop, yet, I am at a loss as to how to do this. I need to read in a fits image, convert this to a numpy array (roughly, 2000 x 2000 elements in size), then for each element compute the statistics of a ring of elements around it.
As I have my code now, the statistics of the ring around the element is computed with a function using masks. This is fast but, of course, I call this function 2000x2000 times (the slow part).
I am relatively new to python. I think that using the mask function is clever, but I cannot find a way around individually addressing each element. Best of thanks for any help you can provide.
# First, the function computing the statistics within a ring around the central pixel:
# flux = image intensity at pixel (i,j)
# rad1, rad2 = inner and outer radii
# array = image array
def snr(flux, i, j, rad1, rad2, array):
    """Return the noise estimate for pixel (i, j) from a ring of pixels around it.

    The ring contains every pixel of ``array`` whose distance d from (i, j)
    satisfies rad1 <= d <= rad2. Pixels of the ring that would fall outside
    the array are simply not part of it. The result is

        0.6052697 * |N * flux - sum(ring pixels)|

    where N is the number of pixels in the ring.
    """
    nx, ny = array.shape
    # Row and column offsets of every pixel relative to (i, j).
    y, x = np.ogrid[-i:nx - i, -j:ny - j]
    dist2 = x * x + y * y
    mask = (dist2 >= rad1 * rad1) & (dist2 <= rad2 * rad2)
    n_mask = np.count_nonzero(mask)
    # ndarray.sum() avoids iterating over the selected pixels in Python.
    noise = 0.6052697 * abs(n_mask * flux - array[mask].sum())
    return noise
# Now, the call to snr for each pixel in the array data1:
frame1 = fits.open(in_frame, mode='readonly') # read in fits file
data1 = frame1[ext].data # convert to np array
ny, nx = data1.shape # array dimensions (rows, columns)
noise1 = np.zeros((ny, nx), float) # output array, same shape as data1
r1 = 5 # inner radius (pixels)
r2 = 7 # outer radius (pixels)

# The function is fast, but calling it 2k x 2k times is not.
# i runs over rows (axis 0) and j over columns (axis 1), so the indexing is
# also valid for non-square images.
for i in range(ny):
    for j in range(nx):
        noise1[i, j] = snr(data1[i, j], i, j, r1, r2, data1)
The operation that you are trying to do can be expressed as an image convolution. Try something like this:
import numpy as np
import scipy.ndimage
from astropy.io import fits
def make_kernel(inner_radius, outer_radius):
    """Return a boolean ring-shaped kernel.

    The kernel is a square array with side ``2 * outer_radius + 1``. An
    entry is True when its distance d from the centre satisfies
    inner_radius <= d <= outer_radius.

    Raises
    ------
    ValueError
        If inner_radius is larger than outer_radius.
    """
    if inner_radius > outer_radius:
        raise ValueError(
            "inner_radius ({}) must not exceed outer_radius ({})".format(
                inner_radius, outer_radius))

    x, y = np.ogrid[-outer_radius:outer_radius + 1, -outer_radius:outer_radius + 1]
    r2 = x * x + y * y
    kernel = (r2 >= inner_radius * inner_radius) & (r2 <= outer_radius * outer_radius)
    return kernel
in_frame = '<file path>'
ext = '...'
frame1 = fits.open(in_frame, mode='readonly')
data1 = frame1[ext].data

inner_radius = 5
outer_radius = 7

kernel = make_kernel(inner_radius, outer_radius)
n_kernel = np.count_nonzero(kernel)

# Work in floating point: ndimage.convolve returns the dtype of its input, so
# integer image data could overflow or be truncated.
image = np.asarray(data1, dtype=float)
weights = kernel.astype(float)

# Sum of the ring pixels around every pixel (zero outside the image).
conv = scipy.ndimage.convolve(image, weights, mode='constant')
# Number of ring pixels that actually lie inside the image. Away from the
# borders this equals n_kernel; near the borders it is smaller, which matches
# the per-pixel count used by the loop version.
counts = scipy.ndimage.convolve(np.ones_like(image), weights, mode='constant')
noise1 = 0.6052697 * np.abs(counts * image - conv)
BACKGROUND: I am trying to build a real-time drum simulation model, for which I need really fast matrix-vector products. My matrices are of the size ~5000-10000 rows/cols, out of which only 6 entries per row are non-zero, hence I am inclined to use sparse matrices. I am using scipy.sparse module. The iterations are as below.
# One time step of the simulation.
Vjk_plus_sparse = Vjk_minus_sparse.transpose()
Vj = Vjk_plus_sparse.dot(constant)
# Zero the entries of Vj at the indices listed in Nr.
np.put(Vj, Nr, 0.0)
# Uj[t] is the previous row plus Vj/fs.
Uj[t] = Uj[t-1] + np.transpose(Vj)/fs
# These last two lines are reported below as the most expensive steps.
Vj_mat = adj_mat_sparse.multiply(Vj)
Vjk_minus_sparse = Vj_mat-Vjk_plus_sparse.multiply(end_gain)
Here, Vjk_plus_sparse, Vjk_minus_sparse and Vj_mat are sparse CSR matrices, Vj is a numpy array, and Uj is a numpy matrix where each row represents Uj(t). end_gain is an array which is a static numpy array for dampening of vibrations.
THE ISSUE: A single iteration takes about 3 ms for size = 4250. With the most significant
steps being the last 2 lines. They together take about 2.5 ms. I would ideally need it to run in 0.1 ms, which would be more than a 10x speedup. This is the maximum extent of vectorization possible for the problem, and I cannot parallelize as I am marching in time, at least physically it won't be accurate.
ATTEMPTS: I tried fiddling with the sparse data structures, and found best performance with all of them being CSR (Compressed Sparse Row), with the values as quoted above. I also tried to replace the multiply() method with a matrix multiplication, by repeating Vj, but that worsened the time, as the resultant operation would be a sparse*dense operation.
How can I speed this up within python itself? I am open to trying c++ as well, though migrating now would be a major pain. Also, since scipy is essentially based in c, would it even give that much of a speedup?
Added a complete runnable example
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches
import math
from mpl_toolkits import mplot3d
import numpy as np
import scipy.sparse as sp
import scipy.fftpack as spf
import matplotlib.animation as animation
import time
sqrt_3 = 1.73205080757
class Pt:
    """A mesh node located at (x_0, y_0)."""

    def __init__(self, x_0, y_0):
        self.x_0 = x_0
        self.y_0 = y_0
        self.id = -1  # -1 until an id is assigned by the mesh
        self.neighbours = []  # ids of the nodes connected to this one
        self.distance = (x_0**2 + y_0**2)**0.5  # distance from the origin


class Circle:
    """A circular mesh of ``Pt`` nodes laid out on a triangular lattice."""

    def __init__(self, radius, center):
        self.radius = radius
        self.center = center
        self.nodes = []  # node ids are indices into this list

    def construct_mesh(self, unit):
        """Fill ``self.nodes`` breadth-first, starting from the centre.

        Neighbouring lattice points are ``2 * unit`` apart in six
        directions. A point is kept when its distance from the origin is at
        most ``self.radius``; points are identified by their coordinates
        rounded to 5 decimals so that the same location reached along
        different paths maps to one node.
        """
        from collections import deque

        half_sqrt3 = (3**0.5)/2
        delta = [(1., 0.), (1./2, half_sqrt3), (-1./2, half_sqrt3),
                 (-1., 0.), (-1./2, -half_sqrt3), (1./2, -half_sqrt3)]

        self.center.distance = 0
        curr_id = 0
        node_dict = {(self.center.x_0, self.center.y_0): curr_id}
        self.nodes.append(self.center)
        curr_id += 1

        # deque gives O(1) pops from the front of the work queue.
        queue = deque([self.center])
        while queue:
            curr_pt = queue.popleft()
            for dx, dy in delta:
                temp_pt = Pt(curr_pt.x_0 + 2*unit*dx, curr_pt.y_0 + 2*unit*dy)
                if not temp_pt.distance <= self.radius:
                    continue  # outside the circle: no node, no link
                key = (round(temp_pt.x_0, 5), round(temp_pt.y_0, 5))
                if key not in node_dict:
                    # New location: register it and schedule it for expansion.
                    temp_pt.id = curr_id
                    self.nodes.append(temp_pt)
                    node_dict[key] = curr_id
                    curr_id += 1
                    queue.append(temp_pt)
                    curr_pt.neighbours.append(temp_pt.id)
                else:
                    # Known location: only record the link.
                    curr_pt.neighbours.append(node_dict[key])

    def plot_neighbours(self, pt):
        """Scatter-plot ``pt`` together with all of its neighbours."""
        linked = [self.nodes[i] for i in pt.neighbours]
        x = [pt.x_0] + [node.x_0 for node in linked]
        y = [pt.y_0] + [node.y_0 for node in linked]
        plt.scatter(x, y)
        plt.axis('scaled')

    def boundary_node_ids(self):
        """Return the ids of all nodes that have fewer than 6 neighbours."""
        return [j for j, node in enumerate(self.nodes)
                if len(node.neighbours) < 6]

    def add_rim(self, boundary_node_ids, unit):
        """Attach one rim node to every node listed in ``boundary_node_ids``.

        Each rim node lies ``unit`` further out along the line from the
        origin through its boundary node, and the two are registered as
        neighbours of each other. Returns the ids of the new rim nodes.

        Raises ZeroDivisionError if a listed node lies at distance 0.
        """
        rim_ids = []
        N = len(self.nodes)
        for i, boundary_id in enumerate(boundary_node_ids):
            inner = self.nodes[boundary_id]
            d = inner.distance
            new_point = Pt(inner.x_0 + inner.x_0*unit/d,
                           inner.y_0 + inner.y_0*unit/d)
            new_point.id = N + i
            rim_ids.append(N + i)
            self.nodes.append(new_point)
            inner.neighbours.append(new_point.id)
            self.nodes[N + i].neighbours.append(boundary_id)
        return rim_ids
def find_nearest_point(mesh, pt):
    """Return the index of the mesh node whose ``distance`` is closest to ``pt.distance``.

    Only the ``distance`` attributes are compared, not the coordinates
    themselves.
    NOTE(review): this picks a node at a similar radius, not necessarily the
    geometrically nearest node - confirm that this is intended.
    """
    # Comprehension instead of an xrange loop (xrange does not exist on Python 3).
    distances_from_center = np.array(
        [node.distance for node in mesh.nodes], dtype=float)
    target_distance = pt.distance
    closest_point_id = np.argmin(np.abs(distances_from_center - target_distance))
    return closest_point_id
def init_impulse(mesh, impulse, Vj, poi, roi, rim_ids=None):
    """Initialise ``Vj`` with an impulse around node ``poi``.

    For node i at distance r from node ``poi`` the value is
    ``max(0, impulse * (1 - r / roi))``; nodes listed in ``rim_ids`` are set
    to 0. ``Vj`` is modified in place.

    Parameters
    ----------
    mesh : object with a ``nodes`` list (x_0, y_0, neighbours per node)
    impulse : amplitude factor
    Vj : array with one entry (or one single-element row) per node
    poi : index of the node at the point of impact
    roi : radius used to scale the distance
    rim_ids : iterable of node indices forced to zero; when omitted, the
        module-level ``Nr`` is used

    Returns
    -------
    (Vj, data) where ``data`` contains ``Vj[i] / 2`` once for every
    neighbour of node i, in node order.

    NOTE(review): with a negative ``impulse`` the expression is zero for
    r < roi and positive for r > roi - confirm that this is intended.
    """
    # A set makes the per-node membership test O(1).
    rim = set(Nr if rim_ids is None else rim_ids)
    source = mesh.nodes[poi]
    data = []
    for i in range(len(Vj)):
        node = mesh.nodes[i]
        r = ((node.x_0 - source.x_0)**2 + (node.y_0 - source.y_0)**2)**0.5
        Vj[i] = max(0, impulse*(1. - (r/roi)))
        if i in rim:
            Vj[i] = 0.
        # .item() replaces np.asscalar, which was removed from NumPy.
        half_value = Vj[i].item()/2.
        data.extend([half_value] * len(node.neighbours))
    return Vj, data
r = 0.1016 #Radius of drum head
# rho = 2500 #Density of drum head
thickness = 0.001 #Thickness of membrane
# tension = 1500 #Tension in membrane in N
param = 0.9
c = (param/thickness)**(0.5) #Speed of wave in string
duration = 0.25
fs = 4000
delta = c/fs

center = Pt(0,0)
point_of_impact = Pt(r/2., 0)
center.id = 0

mesh = Circle(r,center)
mesh.construct_mesh(delta)
N = len(mesh.nodes)

# Nodes with fewer than 6 neighbours; boundary_node_ids() performs exactly
# this selection, so there is no need to repeat the loop here.
Nb = mesh.boundary_node_ids()

# Add one rim node per boundary node; Nr holds the ids of the rim nodes.
Nr = mesh.add_rim(Nb, delta)
N = len(mesh.nodes)
print(N)
# (row, column) index pairs for every node -> neighbour link.
row_ind = [j for j in range(N) for _ in mesh.nodes[j].neighbours]
col_ind = [k for j in range(N) for k in mesh.nodes[j].neighbours]

data = np.ones(len(col_ind))
adj_mat_sparse = sp.csr_matrix((data, (row_ind, col_ind)), shape = (N,N))

# Empty N x N placeholders. Passing the list [N, N] would instead build a
# 1 x 2 matrix holding the two values N and N.
Vjk_plus = sp.csr_matrix((N, N))
Vj = np.zeros([N,1])
Uj = np.zeros([int(duration*fs), N])
Vj_mat = sp.csc_matrix((N, N))

closest_point_id = find_nearest_point(mesh, point_of_impact)
Vj, Vjk_data = init_impulse(mesh, -10.0, Vj, closest_point_id, r/10.)
Vjk_minus_sparse = sp.csr_matrix((Vjk_data, (row_ind, col_ind)), shape = (N,N))
constant = (1./3)*np.ones([N,1])

# First time step (no end_gain applied).
Vjk_plus = Vjk_minus_sparse.transpose()
np.put(Vj, Nr, 0.0)
Uj[1] = Uj[0] + np.transpose(Vj)/fs
Vj_mat = adj_mat_sparse.multiply(Vj)
Vjk_minus_sparse = Vj_mat - Vjk_plus

end_gain = np.ones([N,1])
end_gain[Nr] = 1.0  # currently a no-op: every gain is already 1.0

# Remaining time steps.
n_steps = int(duration*fs)
for t in range(2, n_steps):
    Vjk_plus = Vjk_minus_sparse.transpose()
    Vj = Vjk_plus.dot(constant)
    np.put(Vj, Nr, 0.0)  # zero the entries at the rim node ids
    Uj[t] = Uj[t-1] + np.transpose(Vj)/fs
    Vj_mat = adj_mat_sparse.multiply(Vj)
    Vjk_minus_sparse = Vj_mat - Vjk_plus.multiply(end_gain)
I like to prototype algorithms in Matlab, but I have the requirement of putting them on a server that also runs quite a bit of Python code. Hence I quickly converted the code to Python and compared the two. The Matlab implementation runs ~1000 times faster (from timing function calls - no profiling). Anyone know off hand why the performance of Python is so slow?
Matlab
% init random data
w = 800;
h = 1200;
hmap = zeros(w,h);
npts = 250;
for i=1:npts
    % Draw the cell once. Calling randi separately for the read and for the
    % write would increment the value of a different, unrelated cell.
    r = randi(w);
    c = randi(h);
    hmap(r,c) = hmap(r,c)+1;
end

% Params
disksize = 251;
nBreaks = 25;
saturation = .9;
floorthresh =.05;

% Smooth the point counts with a Gaussian kernel
fh = fspecial('gaussian', disksize, disksize/7);
hmap = conv2(hmap, fh, 'same');

% Scaling, partitioning etc
hmap = hmap/(max(max(hmap)));
hmap(hmap<floorthresh) = 0;
hmap = round(nBreaks * hmap)/nBreaks;
hmap = hmap * (1/saturation);

% Show the image
imshow(hmap, [0,1])
colormap('jet')
Python
import numpy as np
from scipy.signal import convolve2d as conv2
# Test data parameters
w = 800 # first dimension of the data matrix
h = 1200 # second dimension of the data matrix
npts = 250 # number of random points
# generate data: random integer coordinates in [0, w) and [0, h)
xvals = np.random.randint(w, size=npts)
yvals = np.random.randint(h, size=npts)
# Heatmap parameters
gaussianSize = 250 # side length of the Gaussian kernel
nbreaks = 25 # number of quantization levels
# Preliminary function definitions
def populateMat(w, h, xvals, yvals):
    """Return a (w, h) float array counting how often each (x, y) pair occurs.

    ``xvals`` and ``yvals`` are integer index arrays of equal length;
    entry [x, y] of the result is the number of positions idx with
    xvals[idx] == x and yvals[idx] == y.
    """
    container = np.zeros((w,h))
    if xvals.size:
        # np.add.at accumulates repeated index pairs, which a plain
        # ``container[xvals, yvals] += 1`` would not.
        np.add.at(container, (xvals, yvals), 1)
    return container
def makeGaussian(size, fwhm):
    """Return a (size, size) array holding a 2-D Gaussian with peak value 1.

    The Gaussian is centred at index ``size // 2`` along both axes and
    ``fwhm`` is its full width at half maximum, measured in array cells.
    """
    x = np.arange(0, size, 1, float)
    y = x[:,np.newaxis]  # column vector, broadcasts against x
    x0 = y0 = size // 2
    return np.exp(-4*np.log(2) * ((x-x0)**2 + (y-y0)**2) / fwhm**2)
import matplotlib.pyplot as plt

# Create the data matrix
dmat = populateMat(w,h,xvals,yvals)
# Keep the kernel under its own name so that h (the matrix height) is not
# overwritten.
kernel = makeGaussian(gaussianSize, fwhm=gaussianSize/2)
# Convolve
dmat2 = conv2(dmat, kernel, mode='same')
# Scaling etc
dmat2 = dmat2 / dmat2.max()
dmat2 = np.round(nbreaks*dmat2)/nbreaks
# Show (imshow was never imported on its own; use the pyplot function)
plt.imshow(dmat2)
plt.show()
OK, the problem is solved for me thanks to a suggestion in @Yves Daust's comments:
The filter scipy.ndimage.filters.gaussian_filter utilises the separability of the kernel and reduces the running time to within a single order of magnitude of the matlab implementation.
import numpy as np
# gaussian_filter is exported from scipy.ndimage; the scipy.ndimage.filters
# namespace is deprecated.
from scipy.ndimage import gaussian_filter as gaussian
# Test data parameters
w = 800
h = 1200
npts = 250
# generate data: random integer coordinates in [0, w) and [0, h)
xvals = np.random.randint(w, size=npts)
yvals = np.random.randint(h, size=npts)
# Heatmap parameters
gaussianSize = 250
nbreaks = 25
# Preliminary function definitions
def populateMat(w, h, xvals, yvals):
    """Build a (w, h) float matrix of point counts.

    ``xvals`` and ``yvals`` are equally long integer index arrays. Every
    pair (xvals[idx], yvals[idx]) adds 1 to the matching matrix entry, so
    repeated pairs are counted multiple times.
    """
    container = np.zeros((w,h))
    if xvals.size == 0:
        return container
    # Unbuffered in-place addition: duplicates in the index arrays each
    # contribute, exactly like incrementing inside a loop.
    np.add.at(container, (xvals, yvals), 1)
    return container
import matplotlib.pyplot as plt

# Create the data matrix
dmat = populateMat(w,h,xvals,yvals)
# Convolve (the second argument is the standard deviation of the Gaussian)
dmat2 = gaussian(dmat, gaussianSize/7)
# Scaling etc
dmat2 = dmat2 / dmat2.max()
dmat2 = np.round(nbreaks*dmat2)/nbreaks
# Show (imshow was never imported on its own; use the pyplot function)
plt.imshow(dmat2)
plt.show()
Main Problem: How can the scipy.signal.cwt() function be inversed.
I have seen where Matlab has an inverse continuous wavelet transform function which will return the original form of the data by inputting the wavelet transform, although you can filter out the slices you don't want.
MATLAB inverse CWT function
Since scipy doesn't appear to have the same function, I have been trying to figure out how to get the data back in the same form, while removing the noise and background.
How do I do this?
I tried squaring it to remove negative values, but this gives me values that are way too large and not quite right.
Here is what I have been trying:
# Compute the wavelet transform with the Ricker wavelet at widths 1..10
widths = range(1,11)
cwtmatr = signal.cwt(xy['y'], signal.ricker, widths)
# Maybe we multiply by the original data and square?
WT_to_original_data = (xy['y'] * cwtmatr)**2
And here is a fully compilable short script to show you the type of data I am trying to get and what I have etc.:
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
# Make some random data with peaks and noise
def make_peaks(x):
    """Return ``(bkg_peaks, desired_peaks)``: two sums of random Gaussians on ``x``.

    ``desired_peaks`` is the sum of 10 narrow peaks (width between 5 and 15,
    amplitude between 10 and 70). ``bkg_peaks`` is the sum of 3 broad peaks
    (width between 100 and 200, amplitude between 10 and 50). Peak centres
    are drawn as ``x[-1] * random - x[0]``.
    """
    bkg_peaks = np.zeros(len(x))
    desired_peaks = np.zeros(len(x))
    # Make peaks which contain the data desired
    # (Mid range/frequency peaks)
    for _ in range(10):
        center = x[-1] * np.random.random() - x[0]
        amp = 60 * np.random.random() + 10
        width = 10 * np.random.random() + 5
        desired_peaks += amp * np.e**(-(x-center)**2/(2*width**2))
    # Also make background peaks (not desired)
    for _ in range(3):
        center = x[-1] * np.random.random() - x[0]
        amp = 40 * np.random.random() + 10
        width = 100 * np.random.random() + 100
        bkg_peaks += amp * np.e**(-(x-center)**2/(2*width**2))
    return bkg_peaks, desired_peaks
x = np.array(range(0, 1000))
bkg_peaks, desired_peaks = make_peaks(x)
y_noise = np.random.normal(loc=30, scale=10, size=len(x))
y = bkg_peaks + desired_peaks + y_noise
# zip() returns an iterator on Python 3, so it must be turned into a list
# before NumPy can build the structured array from it.
xy = np.array(list(zip(x, y)), dtype=[('x',float), ('y',float)])

# Compute the wavelet transform
# I can't figure out what the width is or does?
widths = range(1,11)
# Ricker is 2nd derivative of Gaussian
# (*close* to what *most* of the features are in my data)
# (They're actually Lorentzians and Breit-Wigner-Fano lines)
# NOTE(review): signal.cwt and signal.ricker are no longer available in
# recent SciPy releases - confirm the installed version.
cwtmatr = signal.cwt(xy['y'], signal.ricker, widths)

# Maybe we multiply by the original data and square?
WT = (xy['y'] * cwtmatr)**2

# plot the data and results
fig = plt.figure()
ax_raw_data = fig.add_subplot(4,3,1)
# One axis per width, in grid positions 2..11; position 12 is reserved for
# the desired result, so it must not be created here as well.
ax = {}
for i in range(0, 10):
    ax[i] = fig.add_subplot(4,3, i+2)
ax_desired_transformed_data = fig.add_subplot(4,3,12)

ax_raw_data.plot(xy['x'], xy['y'], 'g-')
for i in range(0,10):
    ax[i].plot(xy['x'], WT[i])
ax_desired_transformed_data.plot(xy['x'], desired_peaks, 'k-')

fig.tight_layout()
plt.show()
This script will output this image:
Where the first plot is the raw data, the middle plots are the wavelet transforms and the last plot is what I want to get out as the processed (background and noise removed) data.
Does anyone have any suggestions? Thank you so much for the help.
I ended up finding a package called mlpy which provides an inverse wavelet transform. The forward transform is mlpy.wavelet.uwt and its inverse is mlpy.wavelet.iuwt. This is the runnable script I ended up with, which may interest people who are trying to do noise or background removal:
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
import mlpy.wavelet as wave
# Make some random data with peaks and noise
############################################################
def gen_data():
    """Return a structured array with fields 'x' and 'y' holding synthetic test data.

    'x' is 0..999. 'y' is the sum of random narrow peaks, random broad
    background peaks, Gaussian noise (mean 30, standard deviation 10) and an
    additional noise burst concentrated around x = 100.
    """
    def make_peaks(x):
        bkg_peaks = np.zeros(len(x))
        desired_peaks = np.zeros(len(x))
        # Make peaks which contain the data desired
        # (Mid range/frequency peaks)
        for _ in range(10):
            center = x[-1] * np.random.random() - x[0]
            amp = 100 * np.random.random() + 10
            width = 10 * np.random.random() + 5
            desired_peaks += amp * np.e**(-(x-center)**2/(2*width**2))
        # Also make background peaks (not desired)
        for _ in range(3):
            center = x[-1] * np.random.random() - x[0]
            amp = 80 * np.random.random() + 10
            width = 100 * np.random.random() + 100
            bkg_peaks += amp * np.e**(-(x-center)**2/(2*width**2))
        return bkg_peaks, desired_peaks

    # make x axis
    x = np.array(range(0, 1000))
    bkg_peaks, desired_peaks = make_peaks(x)

    avg_noise_level = 30
    std_dev_noise = 10
    size = len(x)

    scattering_noise_amp = 100
    scat_center = 100
    scat_width = 15
    scat_std_dev_noise = 100

    # Noise scaled by a Gaussian envelope centred at scat_center
    y_scattering_noise = np.random.normal(scattering_noise_amp, scat_std_dev_noise, size) * np.e**(-(x-scat_center)**2/(2*scat_width**2))
    y_noise = np.random.normal(avg_noise_level, std_dev_noise, size) + y_scattering_noise
    y = bkg_peaks + desired_peaks + y_noise

    # Fill the structured array field by field. np.array(zip(x, y), ...)
    # fails on Python 3, where zip() returns an iterator.
    xy = np.empty(len(x), dtype=[('x',float), ('y',float)])
    xy['x'] = x
    xy['y'] = y
    return xy
# Random data Generated
#############################################################
xy = gen_data()

# Make 2**n amount of data
new_y, bool_y = wave.pad(xy['y'])
# Positions for which bool_y is truthy
orig_mask = np.where(bool_y)

# wavelet transform parameters
levels = 8
wf = 'h'
k = 2

# Remove Noise first
# Wave transform
wt = wave.uwt(new_y, wf, k, levels)

# Matrix of the difference between each wavelet level and the original data
diff_array = np.array([(wave.iuwt(wt[i:i+1], wf, k)-new_y) for i in range(len(wt))])

# Index of the level which is most similar to original data (to obtain smoothed data)
indx = np.argmin(np.sum(diff_array**2, axis=1))

# Use the wavelet levels around this region
noise_wt = wt[indx:indx+1]

# smoothed data in 2^n length
new_y = wave.iuwt(noise_wt, wf, k)

# Background Removal
error = 10000
errdiff = 100
i = -1
iter_y_dict = {0:np.copy(new_y)}
bkg_approx_dict = {0:np.array([])}
# Iterate until the error stops changing between two iterations.
while abs(errdiff)>=1*10**-24:
    i += 1
    # Wave transform
    wt = wave.uwt(iter_y_dict[i], wf, k, levels)

    # Background approximation from the third- and second-to-last rows of wt
    # (the slice -3:-1 excludes the last row)
    bkg_wt = wt[-3:-1]
    bkg_approx_dict[i] = wave.iuwt(bkg_wt, wf, k)

    # Get the error (computed once per iteration).
    # NOTE(review): this is the square of the summed residual, not the sum
    # of squared residuals - confirm which one is intended.
    new_error = sum(iter_y_dict[i] - bkg_approx_dict[i])**2
    errdiff = error - new_error
    error = new_error

    # Replace every point that lies above the background approximation by
    # the approximation itself; this is the input of the next iteration.
    diff = (new_y - bkg_approx_dict[i])
    peak_idxs_to_remove = np.where(diff>0.)[0]
    iter_y_dict[i+1] = np.copy(new_y)
    iter_y_dict[i+1][peak_idxs_to_remove] = bkg_approx_dict[i][peak_idxs_to_remove]

# new data without noise and background
new_y = new_y[orig_mask]
# i is the key of the last background approximation that was stored
bkg_approx = bkg_approx_dict[i][orig_mask]
new_data = diff[orig_mask]
##############################################################
# Left panel: raw data with every background approximation on top.
# Right panel: the data with noise and background removed.
fig = plt.figure()
ax_raw_data = fig.add_subplot(121)
ax_WT = fig.add_subplot(122)

x_axis = xy['x']
ax_raw_data.plot(x_axis, xy['y'], 'g')
for approximation in bkg_approx_dict.values():
    ax_raw_data.plot(x_axis, approximation[orig_mask], 'k')

ax_WT.plot(x_axis, new_data, 'y')

fig.tight_layout()
plt.show()
And here is the output I am getting now:
As you can see, there is still a problem with the background removal (it shifts to the right after each iteration), but it is a different question which I will address here.