Given a set of circles with random centers and radii, I would like to be able to prune this set so that if overlap between circles occurs, only the largest circle is retained. This is a similar question to the one answered here, but the problem listed there seeks to retain the maximum number of non-overlapping circles, from what I understand. I'd like to be able to adapt the ILP solution given there to my needs, if possible, although a brute-force "search and remove"-type approach would be fine too. The latter is what I've tried so far, but failed to accomplish.
import matplotlib.pyplot as plt
from numpy.random import rand, seed
seed(1)
N = 25 # number of circles
L = 10 # domain size
Rmin = 0.5 # min radius
Rmax = 1 # max radius
cx = rand(N)*(L-2*Rmax) + Rmax
cy = rand(N)*(L-2*Rmax) + Rmax
r = rand(N)*(Rmax-Rmin) + Rmin
# Plotting
for i in range(N):
plt.gca().add_artist(plt.Circle((cx[i], cy[i]), r[i], ec='black', fc='white'))
plt.axis('image')
plt.xlim(0,L)
plt.ylim(0,L)
plt.show()
Desired Result:
It got a bit messy, but this creates the Output you wanted.
import matplotlib.pyplot as plt
from numpy.random import rand, seed
import math
import numpy as np
import pandas as pd
def find_larger(df_circles_2, idx):
found_greater = False
for i,row in df_circles_2.iterrows():
if i != idx:
distance = math.sqrt( (row['x'] - df_circles_2['x'][idx])**2 + (row['y'] - df_circles_2['y'][idx])**2 )
if distance < (row['r'] + df_circles_2['r'][i]):
if row['r'] > df_circles_2['r'][idx] and (row['keep'] != "discard"):
if df_circles['keep'][i] == "keep":
return "discard"
found_greater = True
if found_greater:
return "undecided"
else:
return "keep"
seed(1)
N = 25 # number of circles
L = 10 # domain size
Rmin = 0.5 # min radius
Rmax = 1 # max radius
cx = rand(N)*(L-2*Rmax) + Rmax
cy = rand(N)*(L-2*Rmax) + Rmax
r = rand(N)*(Rmax-Rmin) + Rmin
# Plotting
for i in range(N):
plt.gca().add_artist(plt.Circle((cx[i], cy[i]), r[i], ec='black', fc='white'))
plt.gca().add_artist(plt.Text(cx[i], cy[i], text = str(i)))
plt.axis('image')
plt.xlim(0,L)
plt.ylim(0,L)
plt.show()
# Searching:
df_circles = pd.DataFrame(np.array([cx, cy, r]).T, columns = ['x', 'y', 'r'])
df_circles['keep'] = "undecided"
while(df_circles['keep'].str.contains('undecided').any()):
for i, row in df_circles.iterrows():
if row['keep'] == "undecided":
df_circles.at[i, 'keep'] = find_larger(df_circles, i)
# Plotting 2
plt.figure(2)
for i in range(N):
if df_circles['keep'][i] == "keep":
plt.gca().add_artist(plt.Circle((cx[i], cy[i]), r[i], ec='black', fc='black'))
else:
plt.gca().add_artist(plt.Circle((cx[i], cy[i]), r[i], ec='black', fc='white'))
plt.axis('image')
plt.xlim(0,L)
plt.ylim(0,L)
plt.show()
Related
I have a Python project where I need to redraw a line many times with the points in random places but keeping the line's shape and point count roughly the same. The final output will be using polygonal points and not Bezier paths (though I wouldn't be opposed to using Bezier as an intermediary step).
This animation is demonstrating how the points could move along the line to different positions while maintaining the general shape.
I also have a working example below where I'm moving along the line and picking random new points between existing points (the red line, below). It works okay, but I'd love to hear some other approaches I might take if someone knows of a better one?
Though this code is using matplotlib to demonstrate the line, the final program will not.
import numpy as np
from matplotlib import pyplot as plt
import random
from random import (randint,uniform)
def move_along_line(p1, p2, scalar):
distX = p2[0] - p1[0]
distY = p2[1] - p1[1]
modX = (distX * scalar) + p1[0]
modY = (distY * scalar) + p1[1]
return [modX, modY]
x_coords = [213.5500031,234.3809357,255.211853,276.0427856,296.8737183,317.7046204,340.1997681,364.3751221,388.5505066,414.8896484,444.5192261,478.5549622,514.5779419,545.4779053,570.3830566,588.0241699,598.2469482,599.772583,596.758728,593.7449341,590.7310791,593.373291,610.0373535,642.1326294,677.4451904,710.0697021,737.6887817,764.4020386,791.1152954,817.8284912,844.541687,871.2550049,897.9682007,924.6813965,951.3945923,978.1078491,1009.909546,1042.689941,1068.179199,1089.543091]
y_coords = [487.3099976,456.8832703,426.4565125,396.0297852,365.6030273,335.1763,306.0349426,278.1913452,250.3477478,224.7166748,203.0908051,191.2358704,197.6810608,217.504303,244.4946136,276.7698364,312.0551453,348.6885986,385.4395447,422.1904297,458.9414063,495.5985413,527.0128479,537.1477661,527.6642456,510.959259,486.6988525,461.2799683,435.8611145,410.4422913,385.023468,359.6045532,334.18573,308.7669067,283.3480835,257.929184,239.4429474,253.6099091,280.1803284,310.158783]
plt.plot(x_coords,y_coords,color='b')
plt.scatter(x_coords,y_coords,s=2)
new_line_x = []
new_line_y = []
for tgt in range(len(x_coords)-1):
#tgt = randint(0, len(x_coords)-1)
next_pt = tgt+1
new_pt = move_along_line([x_coords[tgt],y_coords[tgt]], [x_coords[next_pt],y_coords[next_pt]], uniform(0, 1))
new_line_x.append(new_pt[0])
new_line_y.append(new_pt[1])
plt.plot(new_line_x,new_line_y,color='r')
plt.scatter(new_line_x,new_line_y,s=10)
ax = plt.gca()
ax.set_aspect('equal')
plt.show()
Thank you very much!
I'm not sure if this is the most optimal way to do this but essentially you want to follow these steps:
Calculate the distance of the entire path, and the distance between all the points. Then for each point, tally the distances to that point.
Generate a new set of random points along the path starting with 0, then for each pair of points calculate a random distance: random value between 0 and 1 * total length of the path.
Sort these distances from smallest to largest.
For each random distance loop over the distances find the index where the random distance is > than distance i, and less than distance i+1. Interpolate new x and y values from these points.
from matplotlib import pyplot as plt
from scipy.interpolate import interp1d
import numpy
import random
import math
x_coords = [195.21,212.53,237.39,270.91,314.21,368.43,434.69,514.1,607.8,692.69,746.98,773.8,776.25,757.45,720.52,668.55,604.68,545.37,505.79,487.05,490.27,516.58,567.09,642.93,745.2,851.5,939.53,1010.54,1065.8,1106.58,1134.15,1149.75,1154.68]
y_coords = [195.34,272.27,356.59,438.98,510.14,560.76,581.52,563.13,496.27,404.39,318.83,242.15,176.92,125.69,91.02,75.48,81.62,113.49,168.57,239.59,319.29,400.38,475.6,537.67,579.32,586.78,558.32,504.7,436.69,365.05,300.55,253.95,236.03]
n_points = 100
x_coords = numpy.array(x_coords)
x_min = x_coords.min()
x_max = x_coords.max()
x_range = x_max - x_min
distances = []
tallied_distances = [0]
tallied_distance = 0
for i in range(0, len(x_coords) -1):
xi = x_coords[i]
xf = x_coords[i + 1]
yi= y_coords[i]
yf = y_coords[i+1]
d = math.sqrt((xf-xi)**2 + (yf-yi)**2)
tallied_distance += d
tallied_distances.append(tallied_distance)
random_distances_along_line = [0]
for i in range(0, n_points-2):
random_distances_along_line.append(random.random()*tallied_distance)
random_distances_along_line.sort()
new_x_points = [x_coords[0]]
new_y_points = [y_coords[0]]
for i in range(0, len(random_distances_along_line)):
dt = random_distances_along_line[i]
for j in range(0, len(tallied_distances)-1):
di = tallied_distances[j]
df = tallied_distances[j+1]
if di < dt and dt < df:
difference = dt - di
xi = x_coords[j]
xf = x_coords[j+1]
yi = y_coords[j]
yf = y_coords[j+1]
xt = xi+(xf-xi)*difference/(df-di)
yt = yi+(yf-yi)*difference/(df-di)
new_x_points.append(xt)
new_y_points.append(yt)
new_x_points.append(x_coords[len(x_coords)-1])
new_y_points.append(y_coords[len(y_coords)-1])
plt.plot(new_x_points, new_y_points)
plt.scatter(new_x_points, new_y_points,s=2)
ax = plt.gca()
ax.set_aspect('equal')
plt.show()
I want to get all coordinates where two graphs intersect in Matplotlib. I tried to work with DataFrames but did not come to any results.
In this example i drew a circle and a line (for simplicity). And I would want to have a some form coordinates where the line enters the circle and leaves it.
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
r = 1
n = 64
t = np.linspace(0, 2 * np.pi, n + 1)
x_circle = r * np.cos(t) + 1
y_circle = r * np.sin(t) + 1
df = pd.DataFrame({'x':x, 'y':y})
plt.plot(df['x'], df['y'])
plt.plot(range(4), np.array(range(4))*0.6)
plt.show()
The essence of this problem is to obtain the point(s) of intersection between 2 geometries. Then to create a plot to visualize the result. There are several python's modules that can handlle this problem. I choose shapely to demonstrate the steps to get the result. Matplotlib is used to plot, not to determine the intersection. Here is the code and the output plot. Note that I avoid using pandas because it is not relevant.
import matplotlib.pyplot as plt
import numpy as np
from shapely import wkt
from shapely.geometry import LineString
from shapely.wkt import loads
# prep data
r = 1
n = 64
t = np.linspace(0, 2 * np.pi, n + 1)
x_circle = r * np.cos(t) + 1
y_circle = r * np.sin(t) + 1
# create the linestring of circle's perimeter
wktcode1 = "LINESTRING ("
for i,(x,y) in enumerate(zip(x_circle, y_circle)):
if i!=len(x_circle)-1:
wktcode1 += str(x)+" "+str(y)+", "
else:
wktcode1 += str(x)+" "+str(y)+")"
#print(wktcode1)
circle_perim = loads(wktcode1) #a geometry object
# create another geometry, for the line
wktcode2 = "LINESTRING ("
xs = range(4)
ys = np.array(range(4))*0.6
for i,(x,y) in enumerate(zip(xs, ys)):
if i!=len(range(4))-1:
wktcode2 += str(x)+" "+str(y)+", "
else:
wktcode2 += str(x)+" "+str(y)+")"
#print(wktcode2)
pass
line_str = loads(wktcode2) #a geometry object
# check if they are intersect
ixx = circle_perim.intersection(line_str)
# visualization of the intersection
# plot circle
plt.plot(x_circle, y_circle)
# plot line
plt.plot(range(4), np.array(range(4))*0.6)
if ixx:
# plot intersection points
for ea in ixx:
plt.scatter(*ea.xy)
print(*ea.xy)
plt.show()
The output plot:
To get points: Subtract x_circle and y_circle and compare it with 0.
To plot: Use plt.scatter() - documentation.
Full code:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
r = 1
n = 64
t = np.linspace(0, 2 * np.pi, n + 1)
x_circle = r * np.cos(t) + 1
y_circle = r * np.sin(t) + 1
#get intersecting points
pts = np.reshape(np.where((np.array(x_circle, dtype = 'float16')-np.array(y_circle, dtype = 'float16') )== 0)[0], (1, -1))
pts = np.append(pts, np.array(x_circle)[pts], axis = 0)
print(pts)
plt.plot(x_circle)
plt.plot(y_circle)
plt.scatter(pts[0], pts[1], s = 200, marker='o')
#plt.plot(range(4), np.array(range(4))*0.6)
plt.show()
Note: Since all are float values, while calculating np.sin() and
np.cos(), it rounds off the last digit. Hence, while comparing, I
changed it into float16 values.
I am working with a projected coordinate dataset that contains x,y,z data (432 line csv with X Y Z headers, not attached). I wish to import this dataset, calculate a new grid based on user input and then start performing some statistics on points that fall within the new grid. I've gotten to the point that I have two lists (raw_lst with 431(x,y,z) and grid_lst with 16(x,y) (calling n,e)) but when I try to iterate through to start calculating average and density for the new grid it all falls apart. I am trying to output a final list that contains the grid_lst x and y values along with the calculated average z and density values.
I searched numpy and scipy libraries thinking that they may have already had something to do what I am wanting but was unable to find anything. Let me know if any of you all have any thoughts.
sample_xyz_reddot_is_newgrid_pictoral_representation
import pandas as pd
import math
df=pd.read_csv("Sample_xyz.csv")
N=df["X"]
E=df["Y"]
Z=df["Z"]
#grid = int(input("Specify grid value "))
grid = float(0.5) #for quick testing the grid value is set to 0.5
#max and total calculate the input area extents
max_N = math.ceil(max(N))
max_E = math.ceil(max(E))
min_E = math.floor(min(E))
min_N = math.floor(min(N))
total_N = max_N - min_N
total_E = max_E - min_E
total_N = int(total_N/grid)
total_E = int(total_E/grid)
#N_lst and E_lst calculate the mid points based on the input file extents and the specified grid file
N_lst = []
n=float(max_N)-(0.5*grid)
for x in range(total_N):
N_lst.append(n)
n=n-grid
E_lst = []
e=float(max_E)-(0.5*grid)
for x in range(total_E):
E_lst.append(e)
e=e-grid
grid_lst = []
for n in N_lst:
for e in E_lst:
grid_lst.append((n,e))
#converts the imported dataframe to list
raw_lst = df.to_records(index=False)
raw_lst = list(raw_lst)
#print(grid_lst) # grid_lst is a list of 16 (n,e) tuples for the new grid coordinates.
#print(raw_lst) # raw_lst is a list of 441 (n,e,z) tuples from the imported file - calling these x,y,z.
#The calculation where it all falls apart.
t=[]
average_lst = []
for n, e in grid_lst:
for x, y, z in raw_lst:
if n >= x-(grid/2) and n <= x+(grid/2) and e >= y-(grid/2) and e <= y+(grid/2):
t.append(z)
average = sum(t)/len(t)
density = len(t)/grid
average_lst = (n,e,average,density)
print(average_lst)
# print("The length of this list is " + str(len(average_lst)))
# print("The length of t is " + str(len(t)))
SAMPLE CODE FOR RUNNING
import random
grid=5
raw_lst = [(random.randrange(0,10), random.randrange(0,10), random.randrange(0,2))for i in range(100)]
grid_lst = [(2.5,2.5),(2.5,7.5),(7.5,2.5),(7.5,7.5)]
t=[]
average_lst = []
for n, e in grid_lst:
for x, y, z in raw_lst:
if n >= x-(grid/2) and n <= x+(grid/2) and e >= y-(grid/2) and e <= y+(grid/2):
t.append(z)
average = sum(t)/len(t)
density = len(t)/grid
average_lst = (n,e,average,density)
print(average_lst)
Some advices
when working with arrays, use numpy. It has more functionalities
when working with grids it's often more handy the use x-coords, y-coords as single arrays
Comments to the solution
obviousley you have a grid, or rather a box, grd_lst. We generate it as a numpy meshgrid (gx,gy)
you have a number of points raw_list. We generate each elemnt of it as 1-dimensional numpy arrays
you want to select the r_points that are in the g_box. We use the percentage formula for that: tx = (rx-gxMin)/(gxMax-gxMin)
if tx, ty are within [0..1] we store the index
as an intermediate result we get all indices of raw_list that are within the g_box
with that index you can extract the elements of raw_list that are within the g_box and can do some statistics
note that I have omitted the z-coord. You will have to improve this solution.
--
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.colors as mclr
from matplotlib import cm
f10 = 'C://gcg//picStack_10.jpg' # output file name
f20 = 'C://gcg//picStack_20.jpg' # output file name
def plot_grid(gx,gy,rx,ry,Rx,Ry,fOut):
fig = plt.figure(figsize=(5,5))
ax = fig.add_subplot(111)
myCmap = mclr.ListedColormap(['blue','lightgreen'])
ax.pcolormesh(gx, gy, gx, edgecolors='b', cmap=myCmap, lw=1, alpha=0.3)
ax.scatter(rx,ry,s=150,c='r', alpha=0.7)
ax.scatter(Rx,Ry,marker='s', s=150,c='gold', alpha=0.5)
ax.set_aspect('equal')
plt.savefig(fOut)
plt.show()
def get_g_grid(nx,ny):
ix = 2.5 + 5*np.linspace(0,1,nx)
iy = 2.5 + 5*np.linspace(0,1,ny)
gx, gy = np.meshgrid(ix, iy, indexing='ij')
return gx,gy
def get_raw_points(N):
rx,ry,rz,rv = np.random.randint(0,10,N), np.random.randint(0,10,N), np.random.randint(0,2,N), np.random.uniform(low=0.0, high=1.0, size=N)
return rx,ry,rz,rv
N = 100
nx, ny = 2, 2
gx,gy = get_base_grid(nx,ny)
rx,ry,rz,rv = get_raw_points(N)
plot_grid(gx,gy,rx,ry,0,0,f10)
def get_the_points_inside(gx,gy,rx,ry):
#----- run throuh the g-grid -------------------------------
nx,ny = gx.shape
N = len(rx)
index = []
for jx in range(0,nx-1):
for jy in range(0,ny-1):
#--- run through the r_points
for jr in range(N):
test_x = (rx[jr]-gx[jx,jy]) / (gx[jx+1,jy] - gx[jx,jy])
test_y = (ry[jr]-gy[jx,jy]) / (gy[jx,jy+1] - gy[jx,jy])
if (0.0 <= test_x <= 1.0) and (0.0 <= test_y <= 1.0):
index.append(jr)
return index
index = get_the_points_inside(gx,gy,rx,ry)
Rx, Ry, Rz, Rv = rx[index], ry[index], rz[index], rv[index]
plot_grid(gx,gy,rx,ry,Rx,Ry,f20)
I am plotting 2D images of energy and density distribution. There is always a slight misalignment in the mapping where the very first "columns" seem to go to the last columns during the plot.
I have attach link to for data test file.
Data files
Here is the plot :
Is there anything to prevent this ?
The partial code in plotting is as follows:
import numpy as np
import matplotlib.pyplot as plt
import pylab as pyl
import scipy.stats as ss
import matplotlib.ticker as ticker
import matplotlib.transforms as tr
#%matplotlib inline
pi = 3.1415
n = 5e24 # density plasma
m = 9.109e-31
eps = 8.85e-12
e = 1.6021725e-19
c = 3e8
wp=np.sqrt(n*e*e/(m*eps))
kp = np.sqrt(n*e*e/(m*eps))/c #plasma wavenumber
case=400
## decide on the target range of analysis for multiples
start= 20500
end = 21500
gap = 1000
## Multiples plots
def target_range (start, end, gap):
while start<= end:
yield start
start += gap
for step in target_range(start, end, gap):
fdata =np.genfromtxt('./beam_{}'.format(step)).reshape(-1,6)
## dimension, dt, and superpaticle
xBoxsize = 50e-6 #window size
yBoxsize = 80e-6 #window size
xbind = 10
ybind = 1
dx = 4e-8 #cell size
dy = 4e-7 #cell size
dz = 1e-6 #assume to be same as dy
dt = 1.3209965456e-16
sptcl = 1.6e10
xsub = 0e-6
xmax = dt*step*c
xmin = xmax - xBoxsize
ysub = 1e-7
ymin = ysub #to make our view window
ymax = yBoxsize - ysub
xbins = int((xmax - xmin)/(dx*xbind))
ybins = int((ymax - ymin)/(dy*ybind))
#zbins = int((zmax - zmin)/dz) #option for 3D
# To make or define "data_arr" as a matrix with 2D array size 'xbins x ybins'
data_arr = np.zeros((2,xbins,ybins), dtype=np.float)
for line in fdata:
x = int((line[0]-xmin)/(dx*xbind))
y = int((line[1]-ymin)/(dy*ybind))
#z = int((line[2]-zmin)/dz)
if x >= xbins: x = xbins - 1
if y >= ybins: y = ybins - 1
#if z >= zbins: z = zbins - 1
data_arr[0, x, y] = data_arr[0,x, y] + 1 #cummulative adding up the number of particles
energy_total = np.sqrt(1+ line[2]*line[2]/(c*c)+line[3]*line[3]/(c*c))/0.511
data_arr[1, x, y] += energy_total
#array 1 tells us the energy while array 0 tells us the particles
## make average energy , total energy/particle number
np.errstate(divide='ignore',invalid='ignore')
en_arr = np.true_divide(data_arr[1],data_arr[0]) # total energy/number of particles
en_arr[en_arr == np.inf] = 0
en_arr = np.nan_to_num(en_arr)
en_arr = en_arr.T
## This part is real density of the distribution
data_arr[0]= data_arr[0] * sptcl/dx/dy #in m-3
d = data_arr[0].T
## Plot and save density and energy distribution figures
den_dist=plt.figure(1)
plt.imshow(d,origin='lower', aspect = 'auto',cmap =plt.get_cmap('gnuplot'),extent =(xmin/1e-3,xmax/1e-3,ymin/1e-6,ymax/1e-6))
plt.title('Density_dist [m-3]_{}'.format(step))
plt.xlabel('distance[mm]')
plt.ylabel('y [um]')
plt.colorbar()
plt.show()
den_dist.savefig("./Qen_distribution_{}.png".format(step),format ='png')
#note:cmap: rainbow, hot,jet,gnuplot,plasma
energy_dist=plt.figure(2)
plt.imshow(en_arr, origin ='lower',aspect = 'auto', cmap =plt.get_cmap('jet'),extent =(xmin/1e-3,xmax/1e-3,ymin/1e-6,ymax/1e-6))
plt.title ('Energy_dist [MeV]_{} '.format(step))
plt.xlabel('distance[mm]')
plt.ylabel('y [um]')
plt.colorbar()
plt.show()
energy_dist.savefig("./Qenergy_distribution_{}.png".format(step),format ='png')
Main Problem: How can the scipy.signal.cwt() function be inversed.
I have seen where Matlab has an inverse continuous wavelet transform function which will return the original form of the data by inputting the wavelet transform, although you can filter out the slices you don't want.
MATALAB inverse cwt funciton
Since scipy doesn't appear to have the same function, I have been trying to figure out how to get the data back in the same form, while removing the noise and background.
How do I do this?
I tried squaring it to remove negative values, but this gives me values way to large and not quite right.
Here is what I have been trying:
# Compute the wavelet transform
widths = range(1,11)
cwtmatr = signal.cwt(xy['y'], signal.ricker, widths)
# Maybe we multiple by the original data? and square?
WT_to_original_data = (xy['y'] * cwtmatr)**2
And here is a fully compilable short script to show you the type of data I am trying to get and what I have etc.:
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
# Make some random data with peaks and noise
def make_peaks(x):
bkg_peaks = np.array(np.zeros(len(x)))
desired_peaks = np.array(np.zeros(len(x)))
# Make peaks which contain the data desired
# (Mid range/frequency peaks)
for i in range(0,10):
center = x[-1] * np.random.random() - x[0]
amp = 60 * np.random.random() + 10
width = 10 * np.random.random() + 5
desired_peaks += amp * np.e**(-(x-center)**2/(2*width**2))
# Also make background peaks (not desired)
for i in range(0,3):
center = x[-1] * np.random.random() - x[0]
amp = 40 * np.random.random() + 10
width = 100 * np.random.random() + 100
bkg_peaks += amp * np.e**(-(x-center)**2/(2*width**2))
return bkg_peaks, desired_peaks
x = np.array(range(0, 1000))
bkg_peaks, desired_peaks = make_peaks(x)
y_noise = np.random.normal(loc=30, scale=10, size=len(x))
y = bkg_peaks + desired_peaks + y_noise
xy = np.array( zip(x,y), dtype=[('x',float), ('y',float)])
# Compute the wavelet transform
# I can't figure out what the width is or does?
widths = range(1,11)
# Ricker is 2nd derivative of Gaussian
# (*close* to what *most* of the features are in my data)
# (They're actually Lorentzians and Breit-Wigner-Fano lines)
cwtmatr = signal.cwt(xy['y'], signal.ricker, widths)
# Maybe we multiple by the original data? and square?
WT = (xy['y'] * cwtmatr)**2
# plot the data and results
fig = plt.figure()
ax_raw_data = fig.add_subplot(4,3,1)
ax = {}
for i in range(0, 11):
ax[i] = fig.add_subplot(4,3, i+2)
ax_desired_transformed_data = fig.add_subplot(4,3,12)
ax_raw_data.plot(xy['x'], xy['y'], 'g-')
for i in range(0,10):
ax[i].plot(xy['x'], WT[i])
ax_desired_transformed_data.plot(xy['x'], desired_peaks, 'k-')
fig.tight_layout()
plt.show()
This script will output this image:
Where the first plot is the raw data, the middle plots are the wavelet transforms and the last plot is what I want to get out as the processed (background and noise removed) data.
Does anyone have any suggestions? Thank you so much for the help.
I ended up finding a package which provides an inverse wavelet transform function called mlpy. The function is mlpy.wavelet.uwt. This is the compilable script I ended up with which may interest people if they are trying to do noise or background removal:
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
import mlpy.wavelet as wave
# Make some random data with peaks and noise
############################################################
def gen_data():
def make_peaks(x):
bkg_peaks = np.array(np.zeros(len(x)))
desired_peaks = np.array(np.zeros(len(x)))
# Make peaks which contain the data desired
# (Mid range/frequency peaks)
for i in range(0,10):
center = x[-1] * np.random.random() - x[0]
amp = 100 * np.random.random() + 10
width = 10 * np.random.random() + 5
desired_peaks += amp * np.e**(-(x-center)**2/(2*width**2))
# Also make background peaks (not desired)
for i in range(0,3):
center = x[-1] * np.random.random() - x[0]
amp = 80 * np.random.random() + 10
width = 100 * np.random.random() + 100
bkg_peaks += amp * np.e**(-(x-center)**2/(2*width**2))
return bkg_peaks, desired_peaks
# make x axis
x = np.array(range(0, 1000))
bkg_peaks, desired_peaks = make_peaks(x)
avg_noise_level = 30
std_dev_noise = 10
size = len(x)
scattering_noise_amp = 100
scat_center = 100
scat_width = 15
scat_std_dev_noise = 100
y_scattering_noise = np.random.normal(scattering_noise_amp, scat_std_dev_noise, size) * np.e**(-(x-scat_center)**2/(2*scat_width**2))
y_noise = np.random.normal(avg_noise_level, std_dev_noise, size) + y_scattering_noise
y = bkg_peaks + desired_peaks + y_noise
xy = np.array( zip(x,y), dtype=[('x',float), ('y',float)])
return xy
# Random data Generated
#############################################################
xy = gen_data()
# Make 2**n amount of data
new_y, bool_y = wave.pad(xy['y'])
orig_mask = np.where(bool_y==True)
# wavelet transform parameters
levels = 8
wf = 'h'
k = 2
# Remove Noise first
# Wave transform
wt = wave.uwt(new_y, wf, k, levels)
# Matrix of the difference between each wavelet level and the original data
diff_array = np.array([(wave.iuwt(wt[i:i+1], wf, k)-new_y) for i in range(len(wt))])
# Index of the level which is most similar to original data (to obtain smoothed data)
indx = np.argmin(np.sum(diff_array**2, axis=1))
# Use the wavelet levels around this region
noise_wt = wt[indx:indx+1]
# smoothed data in 2^n length
new_y = wave.iuwt(noise_wt, wf, k)
# Background Removal
error = 10000
errdiff = 100
i = -1
iter_y_dict = {0:np.copy(new_y)}
bkg_approx_dict = {0:np.array([])}
while abs(errdiff)>=1*10**-24:
i += 1
# Wave transform
wt = wave.uwt(iter_y_dict[i], wf, k, levels)
# Assume last slice is lowest frequency (background approximation)
bkg_wt = wt[-3:-1]
bkg_approx_dict[i] = wave.iuwt(bkg_wt, wf, k)
# Get the error
errdiff = error - sum(iter_y_dict[i] - bkg_approx_dict[i])**2
error = sum(iter_y_dict[i] - bkg_approx_dict[i])**2
# Make every peak higher than bkg_wt
diff = (new_y - bkg_approx_dict[i])
peak_idxs_to_remove = np.where(diff>0.)[0]
iter_y_dict[i+1] = np.copy(new_y)
iter_y_dict[i+1][peak_idxs_to_remove] = np.copy(bkg_approx_dict[i])[peak_idxs_to_remove]
# new data without noise and background
new_y = new_y[orig_mask]
bkg_approx = bkg_approx_dict[len(bkg_approx_dict.keys())-1][orig_mask]
new_data = diff[orig_mask]
##############################################################
# plot the data and results
fig = plt.figure()
ax_raw_data = fig.add_subplot(121)
ax_WT = fig.add_subplot(122)
ax_raw_data.plot(xy['x'], xy['y'], 'g')
for bkg in bkg_approx_dict.values():
ax_raw_data.plot(xy['x'], bkg[orig_mask], 'k')
ax_WT.plot(xy['x'], new_data, 'y')
fig.tight_layout()
plt.show()
And here is the output I am getting now:
As you can see, there is still a problem with the background removal (it shifts to the right after each iteration), but it is a different question which I will address here.