I'm trying to apply the findpeaks method offered by Matlab on a Python project in order to achieve the same results.
On Internet, I retrieved many algorithms to find peaks in python but the best source I found out is the following one -> https://github.com/MonsieurV/py-findpeaks
However, this didn't solve my problem.
In Matlab, I have this line of code:
[pks, locs] = findpeaks(a, 'MINPEAKDISTANCE', 72)
Hence, i tried out initially with the method offered by peakutils.indexes, in the following way :
locs= peakutils.indexes(y=a, thres=0, min_dist=72)
for val in locs:
pks.append(a[val])
I am not really sure about 'thres=0' but in matlab the default value of threshold is 0, even if it seems intended in a different way with respect to peakutils.indexes.
The problem is that in the Matlab case I got 6635 peaks while in peakutils.indexes I got 6630 peaks (I am working on the signal 108 from MIT-BIH ARRHYTHMIA DATABASE offered by PhysioNet) . Moreover, some of them are not equals, that is in Matlab maybe one peak is located at 155 while in Python it is located at 158, and this, even if it is a small difference, causes problems in my algorithm.
I am actually working on this version of the pan and tompkins algorithm for ecg signal analysis-> https://it.mathworks.com/matlabcentral/fileexchange/45840-complete-pan-tompkins-implementation-ecg-qrs-detector
some time ago I was facing the same problem and I found this function that worked just fine. It's a Matlab equivalent, try out and let us know if it worked for you. The code is not mine.
# %load ./../functions/detect_peaks.py
"""Detect peaks in data based on their amplitude and other features."""
from __future__ import division, print_function
import numpy as np
__author__ = "Marcos Duarte, https://github.com/demotu/BMC"
__version__ = "1.0.4"
__license__ = "MIT"
def detect_peaks(x, mph=None, mpd=1, threshold=0, edge='rising',
kpsh=False, valley=False, show=False, ax=None):
"""Detect peaks in data based on their amplitude and other features.
Parameters
----------
x : 1D array_like
data.
mph : {None, number}, optional (default = None)
detect peaks that are greater than minimum peak height.
mpd : positive integer, optional (default = 1)
detect peaks that are at least separated by minimum peak distance (in
number of data).
threshold : positive number, optional (default = 0)
detect peaks (valleys) that are greater (smaller) than `threshold`
in relation to their immediate neighbors.
edge : {None, 'rising', 'falling', 'both'}, optional (default = 'rising')
for a flat peak, keep only the rising edge ('rising'), only the
falling edge ('falling'), both edges ('both'), or don't detect a
flat peak (None).
kpsh : bool, optional (default = False)
keep peaks with same height even if they are closer than `mpd`.
valley : bool, optional (default = False)
if True (1), detect valleys (local minima) instead of peaks.
show : bool, optional (default = False)
if True (1), plot data in matplotlib figure.
ax : a matplotlib.axes.Axes instance, optional (default = None).
Returns
-------
ind : 1D array_like
indeces of the peaks in `x`.
Notes
-----
The detection of valleys instead of peaks is performed internally by simply
negating the data: `ind_valleys = detect_peaks(-x)`
The function can handle NaN's
See this IPython Notebook [1]_.
References
----------
.. [1] http://nbviewer.ipython.org/github/demotu/BMC/blob/master/notebooks/DetectPeaks.ipynb
Examples
--------
>>> from detect_peaks import detect_peaks
>>> x = np.random.randn(100)
>>> x[60:81] = np.nan
>>> # detect all peaks and plot data
>>> ind = detect_peaks(x, show=True)
>>> print(ind)
>>> x = np.sin(2*np.pi*5*np.linspace(0, 1, 200)) + np.random.randn(200)/5
>>> # set minimum peak height = 0 and minimum peak distance = 20
>>> detect_peaks(x, mph=0, mpd=20, show=True)
>>> x = [0, 1, 0, 2, 0, 3, 0, 2, 0, 1, 0]
>>> # set minimum peak distance = 2
>>> detect_peaks(x, mpd=2, show=True)
>>> x = np.sin(2*np.pi*5*np.linspace(0, 1, 200)) + np.random.randn(200)/5
>>> # detection of valleys instead of peaks
>>> detect_peaks(x, mph=0, mpd=20, valley=True, show=True)
>>> x = [0, 1, 1, 0, 1, 1, 0]
>>> # detect both edges
>>> detect_peaks(x, edge='both', show=True)
>>> x = [-2, 1, -2, 2, 1, 1, 3, 0]
>>> # set threshold = 2
>>> detect_peaks(x, threshold = 2, show=True)
"""
x = np.atleast_1d(x).astype('float64')
if x.size < 3:
return np.array([], dtype=int)
if valley:
x = -x
# find indices of all peaks
dx = x[1:] - x[:-1]
# handle NaN's
indnan = np.where(np.isnan(x))[0]
if indnan.size:
x[indnan] = np.inf
dx[np.where(np.isnan(dx))[0]] = np.inf
ine, ire, ife = np.array([[], [], []], dtype=int)
if not edge:
ine = np.where((np.hstack((dx, 0)) < 0) & (np.hstack((0, dx)) > 0))[0]
else:
if edge.lower() in ['rising', 'both']:
ire = np.where((np.hstack((dx, 0)) <= 0) & (np.hstack((0, dx)) > 0))[0]
if edge.lower() in ['falling', 'both']:
ife = np.where((np.hstack((dx, 0)) < 0) & (np.hstack((0, dx)) >= 0))[0]
ind = np.unique(np.hstack((ine, ire, ife)))
# handle NaN's
if ind.size and indnan.size:
# NaN's and values close to NaN's cannot be peaks
ind = ind[np.in1d(ind, np.unique(np.hstack((indnan, indnan-1, indnan+1))), invert=True)]
# first and last values of x cannot be peaks
if ind.size and ind[0] == 0:
ind = ind[1:]
if ind.size and ind[-1] == x.size-1:
ind = ind[:-1]
# remove peaks < minimum peak height
if ind.size and mph is not None:
ind = ind[x[ind] >= mph]
# remove peaks - neighbors < threshold
if ind.size and threshold > 0:
dx = np.min(np.vstack([x[ind]-x[ind-1], x[ind]-x[ind+1]]), axis=0)
ind = np.delete(ind, np.where(dx < threshold)[0])
# detect small peaks closer than minimum peak distance
if ind.size and mpd > 1:
ind = ind[np.argsort(x[ind])][::-1] # sort ind by peak height
idel = np.zeros(ind.size, dtype=bool)
for i in range(ind.size):
if not idel[i]:
# keep peaks with the same height if kpsh is True
idel = idel | (ind >= ind[i] - mpd) & (ind <= ind[i] + mpd) \
& (x[ind[i]] > x[ind] if kpsh else True)
idel[i] = 0 # Keep current peak
# remove the small peaks and sort back the indices by their occurrence
ind = np.sort(ind[~idel])
if show:
if indnan.size:
x[indnan] = np.nan
if valley:
x = -x
_plot(x, mph, mpd, threshold, edge, valley, ax, ind)
return ind
def _plot(x, mph, mpd, threshold, edge, valley, ax, ind):
"""Plot results of the detect_peaks function, see its help."""
try:
import matplotlib.pyplot as plt
except ImportError:
print('matplotlib is not available.')
else:
if ax is None:
_, ax = plt.subplots(1, 1, figsize=(8, 4))
ax.plot(x, 'b', lw=1)
if ind.size:
label = 'valley' if valley else 'peak'
label = label + 's' if ind.size > 1 else label
ax.plot(ind, x[ind], '+', mfc=None, mec='r', mew=2, ms=8,
label='%d %s' % (ind.size, label))
ax.legend(loc='best', framealpha=.5, numpoints=1)
ax.set_xlim(-.02*x.size, x.size*1.02-1)
ymin, ymax = x[np.isfinite(x)].min(), x[np.isfinite(x)].max()
yrange = ymax - ymin if ymax > ymin else 1
ax.set_ylim(ymin - 0.1*yrange, ymax + 0.1*yrange)
ax.set_xlabel('Data #', fontsize=14)
ax.set_ylabel('Amplitude', fontsize=14)
mode = 'Valley detection' if valley else 'Peak detection'
ax.set_title("%s (mph=%s, mpd=%d, threshold=%s, edge='%s')"
% (mode, str(mph), mpd, str(threshold), edge))
# plt.grid()
plt.show()
Just pass your data without the for loop. It should find all the picks. The following should work:
peaks = peakutils.indexes(data, thres=10/max(data), min_dist=20)
where data is an array of float64. Maybe try to play with the threshold. You should also make sure the min_dist is smaller than peaks distance.
Good luck.
Related
I have a random distribution of points in 2D space like so:
from sklearn import datasets
import pandas as pd
import numpy as np
arr, labels = datasets.make_moons()
arr, labels = datasets.make_blobs(n_samples=1000, centers=3)
pd.DataFrame(arr, columns=['x', 'y']).plot.scatter('x', 'y', s=1)
I'm trying to assign each of these points to the nearest unoccupied slot on an imaginary hex grid to ensure the points don't overlap. The code I'm using to accomplish this goal produces the plot below. The general idea is to create the hex bins, then iterate over each point and find the minimal radius that allows the algorithm to assign that point to an unoccupied hex bin:
from scipy.spatial.distance import euclidean
def get_bounds(arr):
'''Given a 2D array return the y_min, y_max, x_min, x_max'''
return [
np.min(arr[:,1]),
np.max(arr[:,1]),
np.min(arr[:,0]),
np.max(arr[:,0]),
]
def create_mesh(arr, h=100, w=100):
'''arr is a 2d array; h=number of vertical divisions; w=number of horizontal divisions'''
print(' * creating mesh with size', h, w)
bounds = get_bounds(arr)
# create array of valid positions
y_vals = np.arange(bounds[0], bounds[1], (bounds[1]-bounds[0])/h)
x_vals = np.arange(bounds[2], bounds[3], (bounds[3]-bounds[2])/w)
# create the dense mesh
data = np.tile(
[[0, 1], [1, 0]],
np.array([
int(np.ceil(len(y_vals) / 2)),
int(np.ceil(len(x_vals) / 2)),
]))
# ensure each axis has an even number of slots
if len(y_vals) % 2 != 0 or len(x_vals) % 2 != 0:
data = data[0:len(y_vals), 0:len(x_vals)]
return pd.DataFrame(data, index=y_vals, columns=x_vals)
def align_points_to_grid(arr, h=100, w=100, verbose=False):
'''arr is a 2d array; h=number of vertical divisions; w=number of horizontal divisions'''
h = w = len(arr)/10
grid = create_mesh(arr, h=h, w=w)
# fill the mesh
print(' * filling mesh')
df = pd.DataFrame(arr, columns=['x', 'y'])
bounds = get_bounds(arr)
# store the number of points slotted
c = 0
for site, point in df[['x', 'y']].iterrows():
# skip points not in original points domain
if point.y < bounds[0] or point.y > bounds[1] or \
point.x < bounds[2] or point.x > bounds[3]:
raise Exception('Input point is out of bounds', point.x, point.y, bounds)
# assign this point to the nearest open slot
r_y = (bounds[1]-bounds[0])/h
r_x = (bounds[3]-bounds[2])/w
slotted = False
while not slotted:
bottom = grid.index.searchsorted(point.y - r_y)
top = grid.index.searchsorted(point.y + r_y, side='right')
left = grid.columns.searchsorted(point.x - r_x)
right = grid.columns.searchsorted(point.x + r_x, side='right')
close_grid_points = grid.iloc[bottom:top, left:right]
# store the position in this point's radius that minimizes distortion
best_dist = np.inf
grid_loc = [np.nan, np.nan]
for x, col in close_grid_points.iterrows():
for y, val in col.items():
if val != 1: continue
dist = euclidean(point, (x,y))
if dist < best_dist:
best_dist = dist
grid_loc = [x,y]
# no open slots were found so increase the search radius
if np.isnan(grid_loc[0]):
r_y *= 2
r_x *= 2
# success! report the slotted position to the user
else:
# assign a value other than 1 to mark this slot as filled
grid.loc[grid_loc[0], grid_loc[1]] = 2
df.loc[site, ['x', 'y']] = grid_loc
slotted = True
c += 1
if verbose:
print(' * completed', c, 'of', len(arr), 'assignments')
return df
# plot sample data
df = align_points_to_grid(arr, verbose=False)
df.plot.scatter('x', 'y', s=1)
I'm satisfied with the result of this algorithm, but not with the performance.
Is there a faster solution to this kind of hexbin assignment problem in Python? I feel others with more exposure to the Linear Assignment Problem or the Hungarian Algorithm might have valuable insight into this problem. Any suggestions would be hugely helpful!
It turned out assigning each point to the first available grid spot within its current radius was sufficiently performant.
For others who end up here, my lab wrapped this functionality into a little Python package for convenience. You can pip install pointgrid and then:
from pointgrid import align_points_to_grid
from sklearn import datasets
# create fake data
arr, labels = datasets.make_blobs(n_samples=1000, centers=5)
# get updated point positions
updated = align_points_to_grid(arr)
updated will be a numpy array with the same shape as the input array arr.
How can a figure using a rainbow colormap, such as figure 1, be converted so that the same data are displayed using a different color map, such as a perceptually uniform sequential map?
Assume that the underlying data from which the original image was generated are not accessible and the image itself must be recolored using only information within the image.
Background information: rainbow color maps tend to produce visual artifacts. See the cyan line near z = -1.15 m? It looks like there's a sharp edge there. But look at the colorbar itself! Even the color bar has an edge there. There's another fake edge in the yellow band that goes vertically near R = 1.45 m. The horizontal yellow stripe may be a real edge in the underlying data, although it's difficult to distinguish that case from a rainbow artifact.
More information:
http://ieeexplore.ieee.org/abstract/document/4118486/
http://matplotlib.org/users/colormaps.html
Here is my best solution so far:
import numpy as np
import scipy
import os
import matplotlib
import copy
import matplotlib.pyplot as plt
from matplotlib.pyplot import imread, imsave
def_colorbar_loc = [[909, 22], [953 - 20, 959]]
def_working_loc = [[95, 189], [857, 708]]
def recolor_image(
filename='image.png',
colorbar_loc=def_colorbar_loc,
working_loc=def_working_loc,
colorbar_orientation='auto',
colorbar_direction=-1,
new_cmap='viridis',
normalize_before_compare=False,
max_rgb='auto',
threshold=0.4,
saturation_threshold=0.25,
compare_hue=True,
show_plot=True,
debug=False,
):
"""
This script reads in an image file (like .png), reads the image's color bar (you have to tell it where), interprets
the color map used in the image to convert colors to values, then recolors those values with a new color map and
regenerates the figure. Useful for fixing figures that were made with rainbow color maps.
Parameters
-----------
:param filename: Full path and filename of the image file.
:param colorbar_loc: Location of color bar, which will be used to analyze the image and convert colors into values.
Pixels w/ 0,0 at top left corner: [[left, top], [right, bottom]]
:param working_loc: Location of the area to recolor. You don't have to recolor the whole image.
Pixels w/ 0,0 at top left corner: [[left, top], [right, bottom]], set to [[0, 0], [-1, -1]] to do everything.
:param colorbar_orientation: Set to 'x', 'y', or 'auto' to specify whether color map is horizontal, vertical,
or should be determined based on the dimensions of the colorbar_loc
:param colorbar_direction: Controls direction of ascending value
+1: colorbar goes from top to bottom or left to right.
-1: colorbar goes from bottom to top or right to left.
:param new_cmap: String describing the new color map to use in the recolored image.
:param normalize_before_compare: Divide r, g, and b each by (r+g+b) before comparing.
:param max_rgb: Do the values of r, g, and b range from 0 to 1 or from 0 to 255? Set to 1, 255, or 'auto'.
:param threshold: Sum of absolute differences in r, g, b values must be less than threshold to be valid
(0 = perfect, 3 = impossibly bad). Higher numbers = less chance of missing pixels but more chance of recoloring
plot axes, etc.
:param saturation_threshold: Minimum color saturation below which no replacement will take place
:param compare_hue: Use differences in HSV instead of RGB to determine with which index each pixel should be
associated.
:param show_plot: T/F: Open a plot to explain what is going on. Also helpful for checking your aim on the colorbar
coordinates and debugging.
:param debug: T/F: Print debugging information.
"""
def printd(string_in):
"""
Prints debugging statements
:param string_in: String to print only if debug is on.
:return: None
"""
if debug:
print(string_in)
return
print('Recoloring image: {:} ...'.format(filename))
# Determine tag name and load original file into the tree
fn1 = filename.split(os.sep)[-1] # Filename without path
fn2 = fn1.split(os.extsep)[0] # Filename without extension (so new filename can be built later)
ext = fn1.split(os.extsep)[-1] # File extension
path = os.sep.join(filename.split(os.sep)[0:-1]) # Path; used later to save results.
a = imread(filename).astype(float)
printd(f'Read image; shape = {np.shape(a)}')
if max_rgb == 'auto':
# Determine if values of R, G, and B range from 0 to 1 or from 0 to 255
if a.max() > 1:
max_rgb = 255.0
else:
max_rgb = 1.0
# Normalize a so RGB values go from 0 to 1 and are floats.
a /= max_rgb
# Extract the colorbar
x = np.array([colorbar_loc[0][0], colorbar_loc[1][0]])
y = np.array([colorbar_loc[0][1], colorbar_loc[1][1]])
cb = a[y[0]:y[1], x[0]:x[1]]
# Take just the working area, not the whole image
xw = np.array([working_loc[0][0], working_loc[1][0]])
yw = np.array([working_loc[0][1], working_loc[1][1]])
a1 = a[yw[0]:yw[1], xw[0]:xw[1]]
# Pick color bar orientation
if colorbar_orientation == 'auto':
if np.diff(x) > np.diff(y):
colorbar_orientation = 'x'
else:
colorbar_orientation = 'y'
printd('Auto selected colorbar_orientation')
printd('Colorbar orientation is {:}'.format(colorbar_orientation))
# Analyze the colorbar
if colorbar_orientation == 'y':
cb = np.nanmean(cb, axis=1)
else:
cb = np.nanmean(cb, axis=0)
if colorbar_direction < 0:
cb = cb[::-1]
# Compress colorbar to only count unique colors
# If the array gets too big, it will fill memory and crash python: https://github.com/numpy/numpy/issues/14136
dcb = np.append(1, np.sum(abs(np.diff(cb[:, 0:3], axis=0)), axis=1))
cb = cb[dcb > 0]
# Find and mask of special colors that should not be recolored
n1a = np.sum(a1[:, :, 0:3], axis=2)
replacement_mask = np.ones(np.shape(n1a), bool)
for col in [0, 3]: # Black and white will come out as 0 and 3.
mask_update = n1a != col
if mask_update.max() == 0:
print('Warning: masking to protect special colors prevented all changes to the image!')
else:
printd('Good: Special color mask {:} allowed at least some changes'.format(col))
replacement_mask *= mask_update
if replacement_mask.max() == 0:
print('Warning: replacement mask will prevent all changes to the image! '
'(Reached this point during special color protection)')
printd('Sum(replacement_mask) = {:} (after considering special color {:})'
.format(np.sum(np.atleast_1d(replacement_mask)), col))
# Also apply limits to total r+g+b
replacement_mask *= n1a > 0.75
replacement_mask *= n1a < 2.5
if replacement_mask.max() == 0:
print('Warning: replacement mask will prevent all changes to the image! '
'(Reached this point during total r+g+b+ limits)')
printd('Sum(replacement_mask) = {:} (after considering r+g+b upper threshold)'
.format(np.sum(np.atleast_1d(replacement_mask))))
if saturation_threshold > 0:
hsv1 = matplotlib.colors.rgb_to_hsv(a1[:, :, 0:3])
sat = hsv1[:, :, 1]
printd('Saturation ranges from {:} <= sat <= {:}'.format(sat.min(), sat.max()))
sat_mask = sat > saturation_threshold
if sat_mask.max() == 0:
print('Warning: saturation mask will prevent all changes to the image!')
else:
printd('Good: Saturation mask will allow at least some changes')
replacement_mask *= sat_mask
if replacement_mask.max() == 0:
print('Warning: replacement mask will prevent all changes to the image! '
'(Reached this point during saturation threshold)')
printd(f'shape(a1) = {np.shape(a)}')
printd(f'shape(cb) = {np.shape(cb)}')
# Find where on the colorbar each pixel sits
if compare_hue:
# Difference in hue
hsv1 = matplotlib.colors.rgb_to_hsv(a1[:, :, 0:3])
hsv_cb = matplotlib.colors.rgb_to_hsv(cb[:, 0:3])
d2 = abs(hsv1[:, :, :, np.newaxis] - hsv_cb.T[np.newaxis, np.newaxis, :, :])
# d2 = d2[:, :, 0, :] # Take hue only
d2 = np.sum(d2, axis=2)
printd(' shape(d2) = {:} (hue version)'.format(np.shape(d2)))
else:
# Difference in RGB
if normalize_before_compare:
# Difference of normalized RGB arrays
n1 = n1a[:, :, np.newaxis]
n2 = np.sum(cb[:, 0:3], axis=1)[:, np.newaxis]
w1 = n1 == 0
w2 = n2 == 0
n1[w1] = 1
n2[w2] = 1
d = (a1/n1)[:, :, 0:3, np.newaxis] - (cb/n2).T[np.newaxis, np.newaxis, 0:3, :]
else:
# Difference of non-normalized RGB arrays
d = (a1[:, :, 0:3, np.newaxis] - cb.T[np.newaxis, np.newaxis, 0:3, :])
printd(f'Shape(d) = {np.shape(d)}')
d2 = np.sum(np.abs(d[:, :, 0:3, :]), axis=2) # 0:3 excludes the alpha channel from this calculation
printd('Processed colorbar')
index = d2.argmin(axis=2)
md2 = d2.min(axis=2)
index_valid = md2 < threshold
if index_valid.max() == 0:
print('Warning: minimum difference is greater than threshold: all changes rejected!')
else:
printd('Good: Minimum difference filter is lower than threshold for at least one pixel.')
printd('Sum(index_valid) = {:} (before *= replacement_mask)'.format(np.sum(np.atleast_1d(index_valid))))
printd('Sum(replacement_mask) = {:} (final, before combining w/ index_valid)'
.format(np.sum(np.atleast_1d(replacement_mask))))
index_valid *= replacement_mask
if index_valid.max() == 0:
print('Warning: index_valid mask prevents all changes to the image after combination w/ replacement_mask.')
else:
printd('Good: Mask will allow at least one pixel to change.')
printd('Sum(index_valid) = {:}'.format(np.sum(np.atleast_1d(index_valid))))
value = index/(len(cb)-1.0)
printd('Index ranges from {:} to {:}'.format(index.min(), index.max()))
# Make a new image with replaced colors
b = matplotlib.cm.ScalarMappable(cmap=new_cmap).to_rgba(value) # Remap everything
printd('shape(b) = {:}, min(b) = {:}, max(b) = {:}'.format(np.shape(b), b.min(), b.max()))
c = copy.copy(a1) # Copy original
c[index_valid] = b[index_valid] # Transfer only pixels where color was close to colormap
# Transfer working area to full image
c2 = copy.copy(a) # Copy original full image
c2[yw[0]:yw[1], xw[0]:xw[1], :] = c # Replace working area
c2[:, :, 3] = a[:, :, 3] # Preserve original alpha channel
# Save the image in the same path as the original but with _recolored added to the filename.
new_filename = '{:}{:}{:}_recolored{:}{:}'.format(path, os.sep, fn2, os.extsep, ext)
imsave(new_filename, c2)
print('Done recoloring. Result saved to {:} .'.format(new_filename))
if show_plot:
# Setup figure for showing things to the user
f, axs = plt.subplots(2, 3)
axo = axs[0, 0] # Axes for original figure
axoc = axs[0, 1] # Axes for original color bar
axf = axs[0, 2] # Axes for final figure
axm = axs[1, 1] # Axes for mask
axre = axs[1, 2] # Axes for recolored section only (it might not be the whole figure)
axraw = axs[1, 0] # Axes for raw recoloring result before masking
for ax in axs.flatten():
ax.set_xlabel('x pixel')
ax.set_ylabel('y pixel')
axo.set_title('Original image w/ colorbar ID overlay')
axoc.set_title('Color progression from original colorbar')
axm.set_title('Mask')
axre.set_title('Recolored section')
axraw.set_title('Raw recolor result (no masking)')
axf.set_title('Final image')
axoc.set_xlabel('Index')
axoc.set_ylabel('Value')
# Show the user where they placed the color bar and working location
axo.imshow(a)
xx = x[np.array([0, 0, 1, 1, 0])]
yy = y[np.array([0, 1, 1, 0, 0])]
axo.plot(xx, yy, '+-', label='colorbar')
xxw = xw[np.array([0, 0, 1, 1, 0])]
yyw = yw[np.array([0, 1, 1, 0, 0])]
axo.plot(xxw, yyw, '+-', label='target')
tots = np.sum(cb[:, 0:3], axis=1)
if normalize_before_compare:
# Normalized version
axoc.plot(cb[:, 0] / tots, 'r', label='r/(r+g+b)', lw=2)
axoc.plot(cb[:, 1] / tots, 'g', label='g/(r+g+b)', lw=2)
axoc.plot(cb[:, 2] / tots, 'b', label='b/(r+g+b)', lw=2)
axoc.set_ylabel('Normalized value')
else:
axoc.plot(cb[:, 0], 'r', label='r', lw=2)
axoc.plot(cb[:, 1], 'g', label='g', lw=2)
axoc.plot(cb[:, 2], 'b', label='b', lw=2)
axoc.plot(cb[:, 3], color='gray', linestyle='--', label='$\\alpha$')
axoc.plot(tots, 'k', label='r+g+b')
# Display the new colors with no mask, the mask, and the recolored section
axraw.imshow(b)
axm.imshow(index_valid)
axre.imshow(c)
# Display the final result
axf.imshow(c2)
# Finishing touches on plots
axo.legend(loc=0).set_draggable(True)
axoc.legend(loc=0).set_draggable(True)
plt.show()
return
Suppose I have a 3D numpy array of nonzero values and "background" = 0. As an example I will take a sphere of random values:
array = np.random.randint(1, 5, size = (100,100,100))
z,y,x = np.ogrid[-50:50, -50:50, -50:50]
mask = x**2 + y**2 + z**2<= 20**2
array[np.invert(mask)] = 0
First, I would like to find the "border voxels" (all nonzero values that have a zero within their 3x3x3 neigbourhood). Second, I would like to replace all border voxels with the mean of their nonzero neighbours. So far I tried to use scipy's generic filter in the following way:
Function to apply at each element:
def borderCheck(values):
#check if the footprint center is on a nonzero value
if values[13] != 0:
#replace border voxels with the mean of nonzero neighbours
if 0 in values:
return np.sum(values)/np.count_nonzero(values)
else:
return values[13]
else:
return 0
Generic filter:
from scipy import ndimage
result = ndimage.generic_filter(array, borderCheck, footprint = np.ones((3,3,3)))
Is this a proper way to handle this problem? I feel that I am trying to reinvent the wheel here and that there must be a shorter, nicer way to achieve the result. Are there any other suitable (numpy, scipy ) functions that I can use?
EDIT
I messed one thing up: I would like to replace all border voxels with the mean of their nonzero AND non-border neighbours. For this, I tried to clean up the neighbours from ali_m's code (2D case):
#for each neighbour voxel, check whether it also appears in the border/edges
non_border_neighbours = []
for each in neighbours:
non_border_neighbours.append([i for i in each if nonzero_idx[i] not in edge_idx])
Now I can't figure out why non_border_neighbours comes back empty?
Furthermore, correct me if I am wrong but doesn't tree.query_ball_point with radius 1 adress only the 6 next neighbours (euclidean distance 1)? Should I set sqrt(3) (3D case) as radius to get the 26-neighbourhood?
I think it's best to start out with the 2D case first, since it can be visualized much more easily:
import numpy as np
from matplotlib import pyplot as plt
A = np.random.randint(1, 5, size=(100, 100)).astype(np.double)
y, x = np.ogrid[-50:50, -50:50]
mask = x**2 + y**2 <= 30**2
A[~mask] = 0
To find the edge pixels you could perform binary erosion on your mask, then XOR the result with your mask
# rank 2 structure with full connectivity
struct = ndimage.generate_binary_structure(2, 2)
erode = ndimage.binary_erosion(mask, struct)
edges = mask ^ erode
One approach to find the nearest non-zero neighbours of each edge pixel would be to use a scipy.spatial.cKDTree:
from scipy.spatial import cKDTree
# the indices of the non-zero locations and their corresponding values
nonzero_idx = np.vstack(np.where(mask)).T
nonzero_vals = A[mask]
# build a k-D tree
tree = cKDTree(nonzero_idx)
# use it to find the indices of all non-zero values that are at most 1 pixel
# away from each edge pixel
edge_idx = np.vstack(np.where(edges)).T
neighbours = tree.query_ball_point(edge_idx, r=1, p=np.inf)
# take the average value for each set of neighbours
new_vals = np.hstack(np.mean(nonzero_vals[n]) for n in neighbours)
# use these to replace the values of the edge pixels
A_new = A.astype(np.double, copy=True)
A_new[edges] = new_vals
Some visualisation:
fig, ax = plt.subplots(1, 3, figsize=(10, 4), sharex=True, sharey=True)
norm = plt.Normalize(0, A.max())
ax[0].imshow(A, norm=norm)
ax[0].set_title('Original', fontsize='x-large')
ax[1].imshow(edges)
ax[1].set_title('Edges', fontsize='x-large')
ax[2].imshow(A_new, norm=norm)
ax[2].set_title('Averaged', fontsize='x-large')
for aa in ax:
aa.set_axis_off()
ax[0].set_xlim(20, 50)
ax[0].set_ylim(50, 80)
fig.tight_layout()
plt.show()
This approach will also generalize to the 3D case:
B = np.random.randint(1, 5, size=(100, 100, 100)).astype(np.double)
z, y, x = np.ogrid[-50:50, -50:50, -50:50]
mask = x**2 + y**2 + z**2 <= 20**2
B[~mask] = 0
struct = ndimage.generate_binary_structure(3, 3)
erode = ndimage.binary_erosion(mask, struct)
edges = mask ^ erode
nonzero_idx = np.vstack(np.where(mask)).T
nonzero_vals = B[mask]
tree = cKDTree(nonzero_idx)
edge_idx = np.vstack(np.where(edges)).T
neighbours = tree.query_ball_point(edge_idx, r=1, p=np.inf)
new_vals = np.hstack(np.mean(nonzero_vals[n]) for n in neighbours)
B_new = B.astype(np.double, copy=True)
B_new[edges] = new_vals
Test against your version:
def borderCheck(values):
#check if the footprint center is on a nonzero value
if values[13] != 0:
#replace border voxels with the mean of nonzero neighbours
if 0 in values:
return np.sum(values)/np.count_nonzero(values)
else:
return values[13]
else:
return 0
result = ndimage.generic_filter(B, borderCheck, footprint=np.ones((3, 3, 3)))
print(np.allclose(B_new, result))
# True
I'm sure this isn't the most efficient way to do it, but it will still be significantly faster than using generic_filter.
Update
The performance could be further improved by reducing the number of points that are considered as candidate neighbours of the edge pixels/voxels:
# ...
# the edge pixels/voxels plus their immediate non-zero neighbours
erode2 = ndimage.binary_erosion(erode, struct)
candidate_neighbours = mask ^ erode2
nonzero_idx = np.vstack(np.where(candidate_neighbours)).T
nonzero_vals = B[candidate_neighbours]
# ...
There is an array containing 3D data of shape e.g. (64,64,64), how do you plot a plane given by a point and a normal (similar to hkl planes in crystallography), through this dataset?
Similar to what can be done in MayaVi by rotating a plane through the data.
The resulting plot will contain non-square planes in most cases.
Can those be done with matplotlib (some sort of non-rectangular patch)?
Edit: I almost solved this myself (see below) but still wonder how non-rectangular patches can be plotted in matplotlib...?
Edit: Due to discussions below I restated the question.
This is funny, a similar question I replied to just today. The way to go is: interpolation. You can use griddata from scipy.interpolate:
Griddata
This page features a very nice example, and the signature of the function is really close to your data.
You still have to somehow define the points on you plane for which you want to interpolate the data. I will have a look at this, my linear algebra lessons where a couple of years ago
I have the penultimate solution for this problem. Partially solved by using the second answer to Plot a plane based on a normal vector and a point in Matlab or matplotlib :
# coding: utf-8
import numpy as np
from matplotlib.pyplot import imshow,show
A=np.empty((64,64,64)) #This is the data array
def f(x,y):
return np.sin(x/(2*np.pi))+np.cos(y/(2*np.pi))
xx,yy= np.meshgrid(range(64), range(64))
for x in range(64):
A[:,:,x]=f(xx,yy)*np.cos(x/np.pi)
N=np.zeros((64,64))
"""This is the plane we cut from A.
It should be larger than 64, due to diagonal planes being larger.
Will be fixed."""
normal=np.array([-1,-1,1]) #Define cut plane here. Normal vector components restricted to integers
point=np.array([0,0,0])
d = -np.sum(point*normal)
def plane(x,y): # Get plane's z values
return (-normal[0]*x-normal[1]*y-d)/normal[2]
def getZZ(x,y): #Get z for all values x,y. If z>64 it's out of range
for i in x:
for j in y:
if plane(i,j)<64:
N[i,j]=A[i,j,plane(i,j)]
getZZ(range(64),range(64))
imshow(N, interpolation="Nearest")
show()
It's not the ultimate solution since the plot is not restricted to points having a z value, planes larger than 64 * 64 are not accounted for and the planes have to be defined at (0,0,0).
For the reduced requirements, I prepared a simple example
import numpy as np
import pylab as plt
data = np.arange((64**3))
data.resize((64,64,64))
def get_slice(volume, orientation, index):
orientation2slicefunc = {
"x" : lambda ar:ar[index,:,:],
"y" : lambda ar:ar[:,index,:],
"z" : lambda ar:ar[:,:,index]
}
return orientation2slicefunc[orientation](volume)
plt.subplot(221)
plt.imshow(get_slice(data, "x", 10), vmin=0, vmax=64**3)
plt.subplot(222)
plt.imshow(get_slice(data, "x", 39), vmin=0, vmax=64**3)
plt.subplot(223)
plt.imshow(get_slice(data, "y", 15), vmin=0, vmax=64**3)
plt.subplot(224)
plt.imshow(get_slice(data, "z", 25), vmin=0, vmax=64**3)
plt.show()
This leads to the following plot:
The main trick is dictionary mapping orienations to lambda-methods, which saves us from writing annoying if-then-else-blocks. Of course you can decide to give different names,
e.g., numbers, for the orientations.
Maybe this helps you.
Thorsten
P.S.: I didn't care about "IndexOutOfRange", for me it's o.k. to let this exception pop out since it is perfectly understandable in this context.
I had to do something similar for a MRI data enhancement:
Probably the code can be optimized but it works as it is.
My data is 3 dimension numpy array representing an MRI scanner. It has size [128,128,128] but the code can be modified to accept any dimensions. Also when the plane is outside the cube boundary you have to give the default values to the variable fill in the main function, in my case I choose: data_cube[0:5,0:5,0:5].mean()
def create_normal_vector(x, y,z):
normal = np.asarray([x,y,z])
normal = normal/np.sqrt(sum(normal**2))
return normal
def get_plane_equation_parameters(normal,point):
a,b,c = normal
d = np.dot(normal,point)
return a,b,c,d #ax+by+cz=d
def get_point_plane_proximity(plane,point):
#just aproximation
return np.dot(plane[0:-1],point) - plane[-1]
def get_corner_interesections(plane, cube_dim = 128): #to reduce the search space
#dimension is 128,128,128
corners_list = []
only_x = np.zeros(4)
min_prox_x = 9999
min_prox_y = 9999
min_prox_z = 9999
min_prox_yz = 9999
for i in range(cube_dim):
temp_min_prox_x=abs(get_point_plane_proximity(plane,np.asarray([i,0,0])))
# print("pseudo distance x: {0}, point: [{1},0,0]".format(temp_min_prox_x,i))
if temp_min_prox_x < min_prox_x:
min_prox_x = temp_min_prox_x
corner_intersection_x = np.asarray([i,0,0])
only_x[0]= i
temp_min_prox_y=abs(get_point_plane_proximity(plane,np.asarray([i,cube_dim,0])))
# print("pseudo distance y: {0}, point: [{1},{2},0]".format(temp_min_prox_y,i,cube_dim))
if temp_min_prox_y < min_prox_y:
min_prox_y = temp_min_prox_y
corner_intersection_y = np.asarray([i,cube_dim,0])
only_x[1]= i
temp_min_prox_z=abs(get_point_plane_proximity(plane,np.asarray([i,0,cube_dim])))
#print("pseudo distance z: {0}, point: [{1},0,{2}]".format(temp_min_prox_z,i,cube_dim))
if temp_min_prox_z < min_prox_z:
min_prox_z = temp_min_prox_z
corner_intersection_z = np.asarray([i,0,cube_dim])
only_x[2]= i
temp_min_prox_yz=abs(get_point_plane_proximity(plane,np.asarray([i,cube_dim,cube_dim])))
#print("pseudo distance z: {0}, point: [{1},{2},{2}]".format(temp_min_prox_yz,i,cube_dim))
if temp_min_prox_yz < min_prox_yz:
min_prox_yz = temp_min_prox_yz
corner_intersection_yz = np.asarray([i,cube_dim,cube_dim])
only_x[3]= i
corners_list.append(corner_intersection_x)
corners_list.append(corner_intersection_y)
corners_list.append(corner_intersection_z)
corners_list.append(corner_intersection_yz)
corners_list.append(only_x.min())
corners_list.append(only_x.max())
return corners_list
def get_points_intersection(plane,min_x,max_x,data_cube,shape=128):
fill = data_cube[0:5,0:5,0:5].mean() #this can be a parameter
extended_data_cube = np.ones([shape+2,shape,shape])*fill
extended_data_cube[1:shape+1,:,:] = data_cube
diag_image = np.zeros([shape,shape])
min_x_value = 999999
for i in range(shape):
for j in range(shape):
for k in range(int(min_x),int(max_x)+1):
current_value = abs(get_point_plane_proximity(plane,np.asarray([k,i,j])))
#print("current_value:{0}, val: [{1},{2},{3}]".format(current_value,k,i,j))
if current_value < min_x_value:
diag_image[i,j] = extended_data_cube[k,i,j]
min_x_value = current_value
min_x_value = 999999
return diag_image
The way it works is the following:
you create a normal vector:
for example [5,0,3]
normal1=create_normal_vector(5, 0,3) #this is only to normalize
then you create a point:
(my cube data shape is [128,128,128])
point = [64,64,64]
You calculate the plane equation parameters, [a,b,c,d] where ax+by+cz=d
plane1=get_plane_equation_parameters(normal1,point)
then to reduce the search space you can calculate the intersection of the plane with the cube:
corners1 = get_corner_interesections(plane1,128)
where corners1 = [intersection [x,0,0],intersection [x,128,0],intersection [x,0,128],intersection [x,128,128], min intersection [x,y,z], max intersection [x,y,z]]
With all these you can calculate the intersection between the cube and the plane:
image1 = get_points_intersection(plane1,corners1[-2],corners1[-1],data_cube)
Some examples:
normal is [1,0,0] point is [64,64,64]
normal is [5,1,0],[5,1,1],[5,0,1] point is [64,64,64]:
normal is [5,3,0],[5,3,3],[5,0,3] point is [64,64,64]:
normal is [5,-5,0],[5,-5,-5],[5,0,-5] point is [64,64,64]:
Thank you.
The other answers here do not appear to be very efficient with explicit loops over pixels or using scipy.interpolate.griddata, which is designed for unstructured input data. Here is an efficient (vectorized) and generic solution.
There is a pure numpy implementation (for nearest-neighbor "interpolation") and one for linear interpolation, which delegates the interpolation to scipy.ndimage.map_coordinates. (The latter function probably didn't exist in 2013, when this question was asked.)
import numpy as np
from scipy.ndimage import map_coordinates
def slice_datacube(cube, center, eXY, mXY, fill=np.nan, interp=True):
"""Get a 2D slice from a 3-D array.
Copyright: Han-Kwang Nienhuys, 2020.
License: any of CC-BY-SA, CC-BY, BSD, GPL, LGPL
Reference: https://stackoverflow.com/a/62733930/6228891
Parameters:
- cube: 3D array, assumed shape (nx, ny, nz).
- center: shape (3,) with coordinates of center.
can be float.
- eXY: unit vectors, shape (2, 3) - for X and Y axes of the slice.
(unit vectors must be orthogonal; normalization is optional).
- mXY: size tuple of output array (mX, mY) - int.
- fill: value to use for out-of-range points.
- interp: whether to interpolate (rather than using 'nearest')
Return:
- slice: array, shape (mX, mY).
"""
center = np.array(center, dtype=float)
assert center.shape == (3,)
eXY = np.array(eXY)/np.linalg.norm(eXY, axis=1)[:, np.newaxis]
if not np.isclose(eXY[0] # eXY[1], 0, atol=1e-6):
raise ValueError(f'eX and eY not orthogonal.')
# R: rotation matrix: data_coords = center + R # slice_coords
eZ = np.cross(eXY[0], eXY[1])
R = np.array([eXY[0], eXY[1], eZ], dtype=np.float32).T
# setup slice points P with coordinates (X, Y, 0)
mX, mY = int(mXY[0]), int(mXY[1])
Xs = np.arange(0.5-mX/2, 0.5+mX/2)
Ys = np.arange(0.5-mY/2, 0.5+mY/2)
PP = np.zeros((3, mX, mY), dtype=np.float32)
PP[0, :, :] = Xs.reshape(mX, 1)
PP[1, :, :] = Ys.reshape(1, mY)
# Transform to data coordinates (x, y, z) - idx.shape == (3, mX, mY)
if interp:
idx = np.einsum('il,ljk->ijk', R, PP) + center.reshape(3, 1, 1)
slice = map_coordinates(cube, idx, order=1, mode='constant', cval=fill)
else:
idx = np.einsum('il,ljk->ijk', R, PP) + (0.5 + center.reshape(3, 1, 1))
idx = idx.astype(np.int16)
# Find out which coordinates are out of range - shape (mX, mY)
badpoints = np.any([
idx[0, :, :] < 0,
idx[0, :, :] >= cube.shape[0],
idx[1, :, :] < 0,
idx[1, :, :] >= cube.shape[1],
idx[2, :, :] < 0,
idx[2, :, :] >= cube.shape[2],
], axis=0)
idx[:, badpoints] = 0
slice = cube[idx[0], idx[1], idx[2]]
slice[badpoints] = fill
return slice
# Demonstration
nx, ny, nz = 50, 70, 100
cube = np.full((nx, ny, nz), np.float32(1))
cube[nx//4:nx*3//4, :, :] += 1
cube[:, ny//2:ny*3//4, :] += 3
cube[:, :, nz//4:nz//2] += 7
cube[nx//3-2:nx//3+2, ny//2-2:ny//2+2, :] = 0 # black dot
Rz, Rx = np.pi/6, np.pi/4 # rotation angles around z and x
cz, sz = np.cos(Rz), np.sin(Rz)
cx, sx = np.cos(Rx), np.sin(Rx)
Rmz = np.array([[cz, -sz, 0], [sz, cz, 0], [0, 0, 1]])
Rmx = np.array([[1, 0, 0], [0, cx, -sx], [0, sx, cx]])
eXY = (Rmx # Rmz).T[:2]
slice = slice_datacube(
cube,
center=[nx/3, ny/2, nz*0.7],
eXY=eXY,
mXY=[80, 90],
fill=np.nan,
interp=False
)
import matplotlib.pyplot as plt
plt.close('all')
plt.imshow(slice.T) # imshow expects shape (mY, mX)
plt.colorbar()
Output (for interp=False):
For this test case (50x70x100 datacube, 80x90 slice size) the run time is 376 µs (interp=False) and 550 µs (interp=True) on my laptop.
Is there a numpy builtin to do something like the following? That is, take a list d and return a list filtered_d with any outlying elements removed based on some assumed distribution of the points in d.
import numpy as np
def reject_outliers(data):
m = 2
u = np.mean(data)
s = np.std(data)
filtered = [e for e in data if (u - 2 * s < e < u + 2 * s)]
return filtered
>>> d = [2,4,5,1,6,5,40]
>>> filtered_d = reject_outliers(d)
>>> print filtered_d
[2,4,5,1,6,5]
I say 'something like' because the function might allow for varying distributions (poisson, gaussian, etc.) and varying outlier thresholds within those distributions (like the m I've used here).
Something important when dealing with outliers is that one should try to use estimators as robust as possible. The mean of a distribution will be biased by outliers but e.g. the median will be much less.
Building on eumiro's answer:
def reject_outliers(data, m = 2.):
d = np.abs(data - np.median(data))
mdev = np.median(d)
s = d/mdev if mdev else np.zero(len(d))
return data[s<m]
Here I have replace the mean with the more robust median and the standard deviation with the median absolute distance to the median. I then scaled the distances by their (again) median value so that m is on a reasonable relative scale.
Note that for the data[s<m] syntax to work, data must be a numpy array.
This method is almost identical to yours, just more numpyst (also working on numpy arrays only):
def reject_outliers(data, m=2):
return data[abs(data - np.mean(data)) < m * np.std(data)]
Benjamin Bannier's answer yields a pass-through when the median of distances from the median is 0, so I found this modified version a bit more helpful for cases as given in the example below.
def reject_outliers_2(data, m=2.):
d = np.abs(data - np.median(data))
mdev = np.median(d)
s = d / (mdev if mdev else 1.)
return data[s < m]
Example:
data_points = np.array([10, 10, 10, 17, 10, 10])
print(reject_outliers(data_points))
print(reject_outliers_2(data_points))
Gives:
[[10, 10, 10, 17, 10, 10]] # 17 is not filtered
[10, 10, 10, 10, 10] # 17 is filtered (it's distance, 7, is greater than m)
Building on Benjamin's, using pandas.Series, and replacing MAD with IQR:
def reject_outliers(sr, iq_range=0.5):
pcnt = (1 - iq_range) / 2
qlow, median, qhigh = sr.dropna().quantile([pcnt, 0.50, 1-pcnt])
iqr = qhigh - qlow
return sr[ (sr - median).abs() <= iqr]
For instance, if you set iq_range=0.6, the percentiles of the interquartile-range would become: 0.20 <--> 0.80, so more outliers will be included.
An alternative is to make a robust estimation of the standard deviation (assuming Gaussian statistics). Looking up online calculators, I see that the 90% percentile corresponds to 1.2815σ and the 95% is 1.645σ (http://vassarstats.net/tabs.html?#z)
As a simple example:
import numpy as np
# Create some random numbers
x = np.random.normal(5, 2, 1000)
# Calculate the statistics
print("Mean= ", np.mean(x))
print("Median= ", np.median(x))
print("Max/Min=", x.max(), " ", x.min())
print("StdDev=", np.std(x))
print("90th Percentile", np.percentile(x, 90))
# Add a few large points
x[10] += 1000
x[20] += 2000
x[30] += 1500
# Recalculate the statistics
print()
print("Mean= ", np.mean(x))
print("Median= ", np.median(x))
print("Max/Min=", x.max(), " ", x.min())
print("StdDev=", np.std(x))
print("90th Percentile", np.percentile(x, 90))
# Measure the percentile intervals and then estimate Standard Deviation of the distribution, both from median to the 90th percentile and from the 10th to 90th percentile
p90 = np.percentile(x, 90)
p10 = np.percentile(x, 10)
p50 = np.median(x)
# p50 to p90 is 1.2815 sigma
rSig = (p90-p50)/1.2815
print("Robust Sigma=", rSig)
rSig = (p90-p10)/(2*1.2815)
print("Robust Sigma=", rSig)
The output I get is:
Mean= 4.99760520022
Median= 4.95395274981
Max/Min= 11.1226494654 -2.15388472011
Sigma= 1.976629928
90th Percentile 7.52065379649
Mean= 9.64760520022
Median= 4.95667658782
Max/Min= 2205.43861943 -2.15388472011
Sigma= 88.6263902244
90th Percentile 7.60646688694
Robust Sigma= 2.06772555531
Robust Sigma= 1.99878292462
Which is close to the expected value of 2.
If we want to remove points above/below 5 standard deviations (with 1000 points we would expect 1 value > 3 standard deviations):
y = x[abs(x - p50) < rSig*5]
# Print the statistics again
print("Mean= ", np.mean(y))
print("Median= ", np.median(y))
print("Max/Min=", y.max(), " ", y.min())
print("StdDev=", np.std(y))
Which gives:
Mean= 4.99755359935
Median= 4.95213030447
Max/Min= 11.1226494654 -2.15388472011
StdDev= 1.97692712883
I have no idea which approach is the more efficent/robust
I wanted to do something similar, except setting the number to NaN rather than removing it from the data, since if you remove it you change the length which can mess up plotting (i.e. if you're only removing outliers from one column in a table, but you need it to remain the same as the other columns so you can plot them against each other).
To do so I used numpy's masking functions:
def reject_outliers(data, m=2):
stdev = np.std(data)
mean = np.mean(data)
maskMin = mean - stdev * m
maskMax = mean + stdev * m
mask = np.ma.masked_outside(data, maskMin, maskMax)
print('Masking values outside of {} and {}'.format(maskMin, maskMax))
return mask
I would like to provide two methods in this answer, solution based on "z score" and solution based on "IQR".
The code provided in this answer works on both single dim numpy array and multiple numpy array.
Let's import some modules firstly.
import collections
import numpy as np
import scipy.stats as stat
from scipy.stats import iqr
z score based method
This method will test if the number falls outside the three standard deviations. Based on this rule, if the value is outlier, the method will return true, if not, return false.
def sd_outlier(x, axis = None, bar = 3, side = 'both'):
assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'
d_z = stat.zscore(x, axis = axis)
if side == 'gt':
return d_z > bar
elif side == 'lt':
return d_z < -bar
elif side == 'both':
return np.abs(d_z) > bar
IQR based method
This method will test if the value is less than q1 - 1.5 * iqr or greater than q3 + 1.5 * iqr, which is similar to SPSS's plot method.
def q1(x, axis = None):
return np.percentile(x, 25, axis = axis)
def q3(x, axis = None):
return np.percentile(x, 75, axis = axis)
def iqr_outlier(x, axis = None, bar = 1.5, side = 'both'):
assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'
d_iqr = iqr(x, axis = axis)
d_q1 = q1(x, axis = axis)
d_q3 = q3(x, axis = axis)
iqr_distance = np.multiply(d_iqr, bar)
stat_shape = list(x.shape)
if isinstance(axis, collections.Iterable):
for single_axis in axis:
stat_shape[single_axis] = 1
else:
stat_shape[axis] = 1
if side in ['gt', 'both']:
upper_range = d_q3 + iqr_distance
upper_outlier = np.greater(x - upper_range.reshape(stat_shape), 0)
if side in ['lt', 'both']:
lower_range = d_q1 - iqr_distance
lower_outlier = np.less(x - lower_range.reshape(stat_shape), 0)
if side == 'gt':
return upper_outlier
if side == 'lt':
return lower_outlier
if side == 'both':
return np.logical_or(upper_outlier, lower_outlier)
Finally, if you want to filter out the outliers, use a numpy selector.
Have a nice day.
Consider that all the above methods fail when your standard deviation gets very large due to huge outliers.
(Simalar as the average caluclation fails and should rather caluclate the median. Though, the average is "more prone to such an error as the stdDv".)
You could try to iteratively apply your algorithm or you filter using the interquartile range:
(here "factor" relates to a n*sigma range, yet only when your data follows a Gaussian distribution)
import numpy as np
def sortoutOutliers(dataIn,factor):
quant3, quant1 = np.percentile(dataIn, [75 ,25])
iqr = quant3 - quant1
iqrSigma = iqr/1.34896
medData = np.median(dataIn)
dataOut = [ x for x in dataIn if ( (x > medData - factor* iqrSigma) and (x < medData + factor* iqrSigma) ) ]
return(dataOut)
So many answers, but I'm adding a new one that can be useful for the author or even for other users.
You could use the Hampel filter. But you need to work with Series.
Hampel filter returns the Outliers indices, then you can delete them from the Series, and then convert it back to a List.
To use Hampel filter, you can easily install the package with pip:
pip install hampel
Usage:
# Imports
from hampel import hampel
import pandas as pd
list_d = [2, 4, 5, 1, 6, 5, 40]
# List to Series
time_series = pd.Series(list_d)
# Outlier detection with Hampel filter
# Returns the Outlier indices
outlier_indices = hampel(ts = time_series, window_size = 3)
# Drop Outliers indices from Series
filtered_d = time_series.drop(outlier_indices)
filtered_d.values.tolist()
print(f'filtered_d: {filtered_d.values.tolist()}')
And the output will be:
filtered_d: [2, 4, 5, 1, 6, 5]
Where, ts is a pandas Series object and window_size is a total window size will be computed as 2 * window_size + 1.
For this Series I set window_size with the value 3.
The cool thing about working with Series is being able to generate graphics:
# Imports
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
# Plot Original Series
time_series.plot(style = 'k-')
plt.title('Original Series')
plt.show()
# Plot Cleaned Series
filtered_d.plot(style = 'k-')
plt.title('Cleaned Series (Without detected Outliers)')
plt.show()
And the output will be:
To learn more about Hampel filter, I recommend the following readings:
Python implementation of the Hampel Filter
Outlier Detection with Hampel Filter
Clean-up your time series data with a Hampel Filter
if you want to get the index position of the outliers idx_list will return it.
def reject_outliers(data, m = 2.):
d = np.abs(data - np.median(data))
mdev = np.median(d)
s = d/mdev if mdev else 0.
data_range = np.arange(len(data))
idx_list = data_range[s>=m]
return data[s<m], idx_list
data_points = np.array([8, 10, 35, 17, 73, 77])
print(reject_outliers(data_points))
after rejection: [ 8 10 35 17], index positions of outliers: [4 5]
For a set of images (each image has 3 dimensions), where I wanted to reject outliers for each pixel I used:
mean = np.mean(imgs, axis=0)
std = np.std(imgs, axis=0)
mask = np.greater(0.5 * std + 1, np.abs(imgs - mean))
masked = np.multiply(imgs, mask)
Then it is possible to compute the mean:
masked_mean = np.divide(np.sum(masked, axis=0), np.sum(mask, axis=0))
(I use it for Background Subtraction)
Here I find the outliers in x and substitute them with the median of a window of points (win) around them (taking from Benjamin Bannier answer the median deviation)
def outlier_smoother(x, m=3, win=3, plots=False):
''' finds outliers in x, points > m*mdev(x) [mdev:median deviation]
and replaces them with the median of win points around them '''
x_corr = np.copy(x)
d = np.abs(x - np.median(x))
mdev = np.median(d)
idxs_outliers = np.nonzero(d > m*mdev)[0]
for i in idxs_outliers:
if i-win < 0:
x_corr[i] = np.median(np.append(x[0:i], x[i+1:i+win+1]))
elif i+win+1 > len(x):
x_corr[i] = np.median(np.append(x[i-win:i], x[i+1:len(x)]))
else:
x_corr[i] = np.median(np.append(x[i-win:i], x[i+1:i+win+1]))
if plots:
plt.figure('outlier_smoother', clear=True)
plt.plot(x, label='orig.', lw=5)
plt.plot(idxs_outliers, x[idxs_outliers], 'ro', label='outliers')
plt.plot(x_corr, '-o', label='corrected')
plt.legend()
return x_corr
Trim outliers in a numpy array along axis and replace them with min or max values along this axis, whichever is closer. The threshold is z-score:
def np_z_trim(x, threshold=10, axis=0):
""" Replace outliers in numpy ndarray along axis with min or max values
within the threshold along this axis, whichever is closer."""
mean = np.mean(x, axis=axis, keepdims=True)
std = np.std(x, axis=axis, keepdims=True)
masked = np.where(np.abs(x - mean) < threshold * std, x, np.nan)
min = np.nanmin(masked, axis=axis, keepdims=True)
max = np.nanmax(masked, axis=axis, keepdims=True)
repl = np.where(np.abs(x - max) < np.abs(x - min), max, min)
return np.where(np.isnan(masked), repl, masked)
My solution drops the top and bottom percentiles, keeping values that are equal to the boundary:
def remove_percentile_outliers(data, percent_to_drop=0.001):
low, high = data.quantile([percent_to_drop / 2, 1-percent_to_drop / 2])
return data[(data >= low )&(data <= high)]
My solution let the outliers equal to the previous value.
test_data = [2,4,5,1,6,5,40, 3]
def reject_outliers(data, m=2):
mean = np.mean(data)
std = np.std(data)
for i in range(len(data)) :
if np.abs(data[i] -mean) > m*std :
data[i] = data[i-1]
return data
reject_outliers(test_data)
Output:
[2, 4, 5, 1, 6, 5, 5, 3]