I am writing code to remove plateau outliers from time series data. Following some advice I started with np.diff, but it only recognizes a plateau when consecutive values are exactly equal, so plateaus with small fluctuations are missed.
def find_plateaus(F, min_length=200, tolerance=0.75, smoothing=15):
    """Find long index ranges where the smoothed second derivative is near zero.

    The signal is smoothed with a moving average, differentiated twice (with
    a moving average after each differentiation), and every sample whose
    absolute second derivative is at most the ``tolerance`` quantile of all
    absolute second derivatives is flagged as "flat".

    Parameters
    ----------
    F : array_like
        One-dimensional signal. It is converted to float before filtering.
    min_length : int
        A range is kept only if ``stop - start > min_length``.
    tolerance : float
        Quantile in [0, 1] of ``abs(d2F)`` used as the "is zero" threshold.
    smoothing : int
        Window size of every moving-average filter.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, 2); each row is a ``[start, stop)`` pair of
        indices into ``np.diff(mask)``, i.e. a run of ``k`` equal mask
        values starting at ``start`` is reported as ``[start, start + k - 1)``.
    """
    import numpy as np
    # scipy.ndimage.filters is a deprecated alias namespace; the function
    # lives in scipy.ndimage.
    from scipy.ndimage import uniform_filter1d

    # uniform_filter1d produces output in the input dtype by default, so an
    # integer signal would be smoothed with truncation. Work in float.
    F = np.asarray(F, dtype=float)

    # calculate smooth gradients
    smoothF = uniform_filter1d(F, size=smoothing)
    dF = uniform_filter1d(np.gradient(smoothF), size=smoothing)
    d2F = uniform_filter1d(np.gradient(dF), size=smoothing)

    def zero_runs(x):
        """Return an (n, 2) array of [start, stop) pairs of zero runs in x."""
        # Pad with zeros so runs touching either end still produce an edge.
        iszero = np.concatenate(([0], np.equal(x, 0).view(np.int8), [0]))
        absdiff = np.abs(np.diff(iszero))
        return np.where(absdiff == 1)[0].reshape(-1, 2)

    # Find ranges where the second derivative is zero.
    # Values up to eps are assumed to be zero.
    eps = np.quantile(np.abs(d2F), tolerance)
    smalld2F = np.abs(d2F) <= eps

    # Find repetitions in the mask "smalld2F": np.diff of a boolean array is
    # False wherever two neighbouring mask values are equal.
    p = zero_runs(np.diff(smalld2F))
    # A constant run can be constantly True or constantly False; only runs
    # where the mask is True are ranges with a near-zero second derivative.
    p = p[smalld2F[p[:, 0]]]

    # np.diff(p) gives the length of each range found;
    # only accept plateaus longer than min_length.
    plateaus = p[(np.diff(p) > min_length).flatten()]
    return plateaus
# Detect plateaus in `test`; the result holds one index pair per plateau.
plateaus = find_plateaus(test, min_length=5, tolerance = 0.02, smoothing=11)
# Flatten the index pairs into one plain list of boundary indices.
plateaus = np.ravel(plateaus, order = 'A').tolist()
print(plateaus)

# 'T&F' carries the data value at every plateau boundary index and 0
# everywhere else. Index.isin does the membership test for all rows at once
# instead of a Python loop with a list lookup and a .loc write per row.
is_boundary = test2.index.isin(plateaus)
test2['T&F'] = np.where(is_boundary, test2['data'], 0).astype(float)

# Plot the series in black and overlay the marker column in red.
fig, ax = plt.subplots(figsize=(15,6))
ax.plot(test2.index, test2['data'], color='black', label = 'time_series')
ax.scatter(test2.index,test2['T&F'], color='red', label = 'D910')
plt.legend()
plt.show()
Do you know any libraries or methods that can be used?
I want to recognize the parts marked in the picture below.
enter image description here
Still in progress, but I found an answer.
First, reshape the NumPy array into a multidimensional array of consecutive windows.
ex) time_step = 3
.....
Then use np.std() to compute the standard deviation of each window.
After inspecting the values, you can choose a standard-deviation range; windows that fall inside that range are recognized as plateaus.
I wrote this function to compute the normalized percentage correlation between two filter functions (with one shifted). The function works but takes about 8 to 12 minutes depending on the number of elements in nbs. I would like to know if there is another way to make this operation faster. Here is my code below:
import numpy as np
DT = 0.08  # scales the frequency vector as 1/DT; presumably the sampling interval in seconds — confirm


def corr_g(*nbs, Np=10000, sf = 0.5):
    """Compute the normalized correlation of a filter function with shifted copies of itself.

    For every filter parameter in ``nbs`` the filter response ``g`` is
    evaluated on ``Np`` frequency points and correlated with itself for ``Np``
    shifts spread evenly over ``[-Np*sf, Np*sf]``. Each correlation curve is
    scaled so that its maximum is 100.

    Parameters
    ----------
    *nbs : float
        Filter parameters; one output column is produced per value.
    Np : int
        Number of frequency points and number of shifts.
    sf : float
        Largest shift, as a fraction of ``Np``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(Np, len(nbs))``; column ``k`` belongs to ``nbs[k]``.
    """
    wb = 0.25 # bandwidth in Hz
    freq = (1/DT)*np.linspace(-0.5,0.5-1/Np,Np) # frequency vector
    # The shifts do not depend on nb, so build them once.
    d_k_vector = np.linspace(-Np*sf, Np*sf, Np) # (generally non-integer) shifts
    positions = np.arange(Np)
    dCg_norms = np.zeros((Np,len(nbs)))
    for idx, nb in enumerate(nbs): # nb is the filter parameter
        g = ((1+np.exp(-nb))**2)/((1+np.exp(-nb*(freq+wb)/wb))*(1+np.exp(nb*(freq-wb)/wb))) # filter function
        dCg = np.zeros(Np) # array to hold correlation
        for index2, d_k in enumerate(d_k_vector):
            # Keep only positions whose (fractional) shifted position stays
            # inside g, then pair each with the sample int(d_k) steps away.
            # This is the same validity test and the same truncating shift
            # as an element-by-element loop, done as one dot product.
            shifted = positions + d_k
            valid = positions[(shifted >= 0) & (shifted < Np)]
            dCg[index2] = np.dot(g[valid], g[valid + int(d_k)])
        dCg_norm = dCg/np.max(dCg)*100 # normalized correlation
        dCg_norms[:,idx] = dCg_norm # add to allocated array
    return dCg_norms
# Compute the normalized correlation for four filter parameters; the result
# has one column per parameter, in the order given.
my_arr = corr_g(*[2,4,8,16])
import matplotlib.pyplot as plt
# NOTE(review): Np and DT are (re)assigned at module level here, so any later
# code that reads these globals sees these values.
Np = 10000
DT = 0.08
# Shift axis for the plot: Np points from -5000 to 5000.
d_k_vector = np.linspace(-5000, 5000, Np)
# Plot the second column (the second parameter passed above) against the
# rescaled shift axis. NOTE(review): the literals 10000 and 0.25 look like
# Np and the filter bandwidth — confirm before changing either.
plt.plot(d_k_vector/(10000*DT)/0.25,my_arr[:,1])
You should not calculate the correlation yourself; it is better to use np.correlate(g, g, 'same'). There are small differences between your result and mine, and I am fairly sure the error is on your side.
def corr_g2(*nbs, Np=10000, sf = 0.5):
    """Normalized self-correlation of the filter function, computed with np.correlate.

    One column is produced per filter parameter in ``nbs``; every column is
    scaled so that its maximum equals 100. ``sf`` is accepted but not used.
    Returns an array of shape ``(Np, len(nbs))``.
    """
    bandwidth = 0.25 # bandwidth in Hz
    freq_axis = (1/DT)*np.linspace(-0.5,0.5-1/Np,Np) # frequency vector
    result = np.zeros((Np,len(nbs)))
    for column, nb in enumerate(nbs): # nb is the filter parameter
        # Build the filter function from its three factors.
        numerator = (1+np.exp(-nb))**2
        lower_edge = 1+np.exp(-nb*(freq_axis+bandwidth)/bandwidth)
        upper_edge = 1+np.exp(nb*(freq_axis-bandwidth)/bandwidth)
        response = numerator/(lower_edge*upper_edge)
        # 'same' keeps the output the same length as the input.
        autocorr = np.correlate(response, response, 'same')
        result[:,column] = autocorr/np.max(autocorr)*100
    return result
def main():
    """Run the loop-based and the np.correlate-based implementation on the same input."""
    filter_params = (2, 4)
    loop_result = corr_g(*filter_params, Np=Np)
    correlate_result = corr_g2(*filter_params, Np=Np)
    # To compare the two visually, plot column 1 of each result against
    # np.linspace(-Np / 2, Np / 2 - 1, Np) / (10000 * DT) / 0.25
    # with matplotlib.


if __name__ == '__main__':
    main()
Profiling results for Np=1000:
Line # Hits Time Per Hit % Time Line Contents
==============================================================
39 #do_profile()
40 def main():
41 1 14419637.0 14419637.0 100.0 my_arr = corr_g(*[2,4], Np=Np)
42 1 1598.0 1598.0 0.0 my_arr2 = corr_g2(*[2,4], Np=Np)
I'm working on unsteady (non-stationary) experimental data from fluid dynamics. We have measured data on three channels, and the samples are not directly coincident (i.e. not measured at the same time). I want to filter them with a window scheme to keep the coincident samples and discard all others.
Unfortunately, I cannot upload the original data set due to company restrictions, but I have set up a minimal example that generates a similar (smaller) dataset. The original dataset consists of 500000 values per channel, each recorded with an arrival time. The coincidence is checked using these time stamps.
Right now, I loop over each sample from the first channel and look at the time differences to the other channels. If a difference is smaller than the specified window width, the index is saved. It would probably be a little faster if I specified an interval in which to check for the differences (like 100 or 1000 samples in the neighborhood), but the data rate can differ significantly between the channels, so this is not implemented yet. I would prefer to get rid of looping over each sample, if possible.
def filterCoincidence(df, window = 50e-6):
    '''
    Filters the dataset with arbitrary different data rates on different channels to coincident samples.
    The coincidence is checked with regard to a time window specified as argument.

    - arguments:
        - df: DataFrame; every column whose name contains 'AT' is treated as
          one channel of arrival times. The first such column is the reference.
        - window: largest allowed absolute time difference to the reference sample
    - output: float array with one row per coincident sample set. Column 0 is
      the position of the reference sample; each further column is the index
      label of the matched sample in the corresponding channel. Returns None
      (after printing a message) when fewer than two channels are present.

    For every reference sample only the closest sample of each other channel
    is considered. If that sample is already part of an earlier accepted row,
    or no sample lies within the window, the reference sample is dropped.
    '''
    AT_cols = [col for col in df.columns if 'AT' in col]
    if len(AT_cols) < 2:
        print('only one group available')
        return

    other_cols = AT_cols[1:]
    used_ix = np.full((df.shape[0], len(AT_cols)), np.nan)
    # Labels already matched, one set per non-reference channel. A set makes
    # the "already used" test O(1) instead of a scan over a whole column.
    taken = [set() for _ in other_cols]

    for ix, sample in enumerate(df[AT_cols[0]]):
        used_ix[ix, 0] = ix
        matches = []
        for ii, AT_col in enumerate(other_cols):
            diff = np.abs(df[AT_col] - sample)
            in_window = diff[diff <= window]
            if in_window.empty:
                break  # no partner in this channel -> reference sample is dropped
            closest = in_window.idxmin()
            if closest in taken[ii]:
                break  # closest partner already belongs to another row
            matches.append(closest)
        else:
            # Every channel delivered a free partner: accept the row and only
            # now mark the partners as used.
            used_ix[ix, 1:] = matches
            for ii, label in enumerate(matches):
                taken[ii].add(label)

    # Rows that were not accepted still contain NaN; remove them.
    used_ix = used_ix[~np.isnan(used_ix).any(axis=1)]
    print(used_ix.shape)
    return used_ix
# Minimal example: every channel gets `no_points` sorted random arrival times
# drawn uniformly from [0, meas_duration).
no_points = 10000
no_groups = 3
meas_duration = 60
df = pd.DataFrame(
    {
        f'AT {i}': np.sort(np.random.rand(no_points) * meas_duration)
        for i in range(no_groups)
    }
)
filterCoincidence(df, window=1e-3)
Is there a module already implemented, which can do this sort of filtering? However, it would be awesome if you can give me some hints to increase the performance of the code.
Just to update this thread in case somebody else has a similar problem: after several code revisions, I think I have found a proper solution.
def filterCoincidence(self, AT1, AT2, AT3, window = 0.05e-3):
    '''
    Filters the dataset with arbitrary different data rates on different channels to coincident samples.
    The coincidence is checked with regard to a time window specified as argument.
    - arguments:
        - three times series AT1, AT2 and AT3 (arrival times of particles in my case);
          each must be sorted ascending because np.searchsorted is used on them
        - window size (50 microseconds as default setting)
    - output: indices of combined samples, one row per coincident triple with
      the columns in the order AT1, AT2, AT3. The indices refer to the series
      AFTER samples outside the common time span have been removed. The array
      has a float dtype because a distance column is temporarily attached to it.
    '''
    start_time = datetime.datetime.now()

    def finish(result):
        # Single place for the summary line and the return value.
        print('{} coincident samples obtained in {}.'.format(result.shape[0], datetime.datetime.now()-start_time))
        return result

    AT_list = [np.asarray(AT1), np.asarray(AT2), np.asarray(AT3)]

    # take the shortest period of time
    # (computed per channel: the channels may differ in length, and NumPy
    # refuses to build one array from unequal-length arrays)
    min_EndArrival = min(np.max(col) for col in AT_list)
    max_BeginArrival = max(np.min(col) for col in AT_list)
    for i, col in enumerate(AT_list):
        outside = (col < max_BeginArrival - window) | (col > min_EndArrival + window)
        AT_list[i] = col[~outside]

    # get channel with lowest datarate
    datarate = np.zeros(len(AT_list))
    for i, AT in enumerate(AT_list):
        datarate[i] = AT.shape[0] / (AT[-1]-AT[0])
    used_ref = int(np.argmin(datarate))

    # process coincidence
    AT_ref_val = AT_list[used_ref]
    # Plain list filtering: np.delete would first try to stack the channels.
    AT_list = [AT for i, AT in enumerate(AT_list) if i != used_ref]
    # overview: column 0 = reference index, columns 1-2 = "has a candidate" flags
    overview = np.zeros( (AT_ref_val.shape[0], 3), dtype=int)
    overview[:,0] = np.arange(AT_ref_val.shape[0], dtype=int)
    borders = [None, None]
    max_diff = np.zeros(2, dtype=int)
    for i, AT in enumerate(AT_list):
        # [lower, upper) is the index range of samples within +-window of
        # each reference sample; side='right' makes the upper bound inclusive,
        # matching the `<= window` distance check further down.
        neighbors_lower = np.searchsorted(AT, AT_ref_val - window, side='left')
        neighbors_upper = np.searchsorted(AT, AT_ref_val + window, side='right')
        borders[i] = np.transpose([neighbors_lower, neighbors_upper])
        counts = np.diff(borders[i], axis=1).flatten()
        coinc_ix = np.where(counts != 0)[0]
        max_diff[i] = np.max(counts)
        overview[coinc_ix, i+1] = 1
    # Keep reference samples that have a candidate in BOTH other channels.
    # Only the flag columns are tested: column 0 holds the reference index,
    # and testing it too would always discard reference sample 0.
    use_ix = np.where(np.all(overview[:, 1:] != 0, axis=1))[0]
    borders[0] = borders[0][use_ix]
    borders[1] = borders[1][use_ix]
    overview = overview[use_ix]

    # create all possible combinations refer to the reference
    # Every reference sample gets `combinations` rows; unused rows stay None
    # and are removed afterwards.
    combinations = np.prod(max_diff)
    test = np.empty((overview.shape[0]*combinations, 3), dtype=object)
    for i, [ref_ix, at1, at2] in enumerate(zip(overview[:, 0], borders[0], borders[1])):
        test[i * combinations:i * combinations + combinations, 0] = ref_ix
        at1 = np.arange(at1[0], at1[1])
        at2 = np.arange(at2[0], at2[1])
        test[i*combinations:i*combinations+at1.shape[0]*at2.shape[0],1:] = np.asarray(list(itertools.product(at1, at2)))
    test = test[~np.any(pd.isnull(test), axis=1)]
    if test.shape[0] == 0:
        return finish(np.empty((0, 3)))

    # check distances
    # Put the reference column back at its original channel position.
    ix_ref = test[:,0]
    test = test[:,1:]
    test = np.insert(test, used_ref, ix_ref, axis=1)
    test = test.astype(int)
    AT_list.insert(used_ref, AT_ref_val)
    AT_mat = np.zeros(test.shape)
    for i, AT in enumerate(AT_list):
        AT_mat[:,i] = AT[test[:,i]]
    # All pairwise time differences must lie within the window.
    distances = np.zeros( (test.shape[0], len(list(itertools.combinations(range(3), 2)))))
    for i, AT in enumerate(itertools.combinations(range(3), 2)):
        distances[:,i] = np.abs(AT_mat[:,AT[0]]-AT_mat[:,AT[1]])
    ix = np.where(np.all(distances <= window, axis=1))[0]
    test = test[ix,:]
    distances = distances[ix,:]
    if test.shape[0] == 0:
        return finish(np.empty((0, 3)))

    # check duplicates
    # The largest pairwise distance of a triple is its similarity factor.
    # Rows are sorted by it, so np.unique (which reports the first occurrence)
    # keeps the tightest triple for every index of a channel.
    dist_sum = np.max(distances, axis=1)
    unique_sorted = np.argsort([np.unique(test[:,i]).shape[0] for i in range(test.shape[1])])[::-1]
    test = np.hstack([test, dist_sum.reshape(-1, 1)])
    test = test[test[:,-1].argsort()]
    for j in unique_sorted:
        _, ix = np.unique(test[:,j], return_index=True)
        test = test[ix, :]
    test = test[:,:3]
    test = test[test[:,used_ref].argsort()]

    # check that all values are after each other
    # NOTE(review): a row is kept when ANY column increases towards the next
    # row (the last row is always kept) — confirm that ANY, not ALL, is intended.
    ix = np.where(np.any(np.diff(test, axis=0) > 0, axis=1))[0]
    ix = np.append(ix, test.shape[0]-1)
    test = test[ix,:]
    return finish(test)
I'm certain that there is a better solution, but for me it works now. And I know, the variable names should definitely be chosen with more clarity (e.g. test), but I will clean up my code at the end of my master thesis... perhaps :-)
Is there a numpy builtin to do something like the following? That is, take a list d and return a list filtered_d with any outlying elements removed based on some assumed distribution of the points in d.
import numpy as np
def reject_outliers(data, m=2):
    """Return the elements of data that lie strictly within m standard deviations of the mean.

    Parameters
    ----------
    data : sequence of numbers
    m : number
        Half-width of the accepted band in units of the standard deviation.

    Returns
    -------
    list
        The accepted elements, in their original order.
    """
    if len(data) == 0:
        return []
    u = np.mean(data)
    s = np.std(data)
    # Use the threshold m itself rather than a hard-coded 2.
    lower, upper = u - m * s, u + m * s
    return [e for e in data if lower < e < upper]
>>> d = [2,4,5,1,6,5,40]
>>> filtered_d = reject_outliers(d)
>>> print filtered_d
[2,4,5,1,6,5]
I say 'something like' because the function might allow for varying distributions (poisson, gaussian, etc.) and varying outlier thresholds within those distributions (like the m I've used here).
Something important when dealing with outliers is that one should try to use estimators as robust as possible. The mean of a distribution will be biased by outliers but e.g. the median will be much less.
Building on eumiro's answer:
def reject_outliers(data, m = 2.):
    """Keep the values whose scaled distance from the median is below m.

    The distance of every value from the median is divided by the median of
    those distances (the median absolute deviation). If that median is zero,
    all scaled distances are taken as zero and the data passes through
    unchanged.

    Parameters
    ----------
    data : array_like
    m : float
        Threshold on the scaled distance.

    Returns
    -------
    numpy.ndarray
        The accepted values.
    """
    data = np.asarray(data)
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    # np.zeros_like (not the non-existent np.zero) for the zero-spread case.
    s = d / mdev if mdev else np.zeros_like(d, dtype=float)
    return data[s < m]
Here I have replaced the mean with the more robust median, and the standard deviation with the median absolute distance to the median. I then scaled the distances by their (again) median value so that m is on a reasonable relative scale.
Note that for the data[s<m] syntax to work, data must be a numpy array.
This method is almost identical to yours, just more numpyst (also working on numpy arrays only):
def reject_outliers(data, m=2):
    """Return the entries of the NumPy array data closer than m standard deviations to the mean."""
    deviation = abs(data - np.mean(data))
    limit = m * np.std(data)
    return data[deviation < limit]
Benjamin Bannier's answer yields a pass-through when the median of distances from the median is 0, so I found this modified version a bit more helpful for cases as given in the example below.
def reject_outliers_2(data, m=2.):
    """Keep values whose distance from the median, scaled by the median distance, is below m.

    When the median distance is zero the distances are left unscaled
    (divided by 1), so m then acts as an absolute distance threshold.
    """
    center = np.median(data)
    distance = np.abs(data - center)
    scale = np.median(distance)
    if not scale:
        scale = 1.
    return data[distance / scale < m]
Example:
# Same input for both variants: five identical values and one deviating value.
data_points = np.array([10, 10, 10, 17, 10, 10])
for reject in (reject_outliers, reject_outliers_2):
    print(reject(data_points))
Gives:
[[10, 10, 10, 17, 10, 10]] # 17 is not filtered
[10, 10, 10, 10, 10] # 17 is filtered (its distance, 7, is greater than m)
Building on Benjamin's, using pandas.Series, and replacing MAD with IQR:
def reject_outliers(sr, iq_range=0.5):
    """Keep the entries of a pandas Series within one inter-quantile range of the median.

    iq_range is the central fraction of the data spanned by the two
    quantiles (0.5 gives the 25 % and 75 % quantiles). Missing values are
    ignored when the quantiles are computed.
    """
    tail = (1 - iq_range) / 2
    quantiles = sr.dropna().quantile([tail, 0.50, 1-tail])
    qlow, median, qhigh = quantiles
    spread = qhigh - qlow
    within = (sr - median).abs() <= spread
    return sr[within]
For instance, if you set iq_range=0.6, the percentiles of the interquartile-range would become: 0.20 <--> 0.80, so more outliers will be included.
An alternative is to make a robust estimation of the standard deviation (assuming Gaussian statistics). Looking up online calculators, I see that the 90% percentile corresponds to 1.2815σ and the 95% is 1.645σ (http://vassarstats.net/tabs.html?#z)
As a simple example:
import numpy as np
# Create some random numbers: 1000 draws from a normal distribution with
# mean 5 and standard deviation 2 (unseeded, so every run differs)
x = np.random.normal(5, 2, 1000)
# Calculate the statistics of the clean sample
print("Mean= ", np.mean(x))
print("Median= ", np.median(x))
print("Max/Min=", x.max(), " ", x.min())
print("StdDev=", np.std(x))
print("90th Percentile", np.percentile(x, 90))
# Add a few large points (three artificial outliers)
x[10] += 1000
x[20] += 2000
x[30] += 1500
# Recalculate the statistics on the contaminated sample
print()
print("Mean= ", np.mean(x))
print("Median= ", np.median(x))
print("Max/Min=", x.max(), " ", x.min())
print("StdDev=", np.std(x))
print("90th Percentile", np.percentile(x, 90))
# Measure the percentile intervals and then estimate the standard deviation
# of the distribution, both from the median to the 90th percentile and from
# the 10th to the 90th percentile
p90 = np.percentile(x, 90)
p10 = np.percentile(x, 10)
p50 = np.median(x)
# p50 to p90 is 1.2815 sigma (for Gaussian data)
rSig = (p90-p50)/1.2815
print("Robust Sigma=", rSig)
# p10 to p90 spans twice that distance; this second estimate overwrites rSig
rSig = (p90-p10)/(2*1.2815)
print("Robust Sigma=", rSig)
The output I get is:
Mean= 4.99760520022
Median= 4.95395274981
Max/Min= 11.1226494654 -2.15388472011
StdDev= 1.976629928
90th Percentile 7.52065379649
Mean= 9.64760520022
Median= 4.95667658782
Max/Min= 2205.43861943 -2.15388472011
StdDev= 88.6263902244
90th Percentile 7.60646688694
Robust Sigma= 2.06772555531
Robust Sigma= 1.99878292462
Which is close to the expected value of 2.
If we want to remove points above/below 5 standard deviations (with 1000 points we would expect 1 value > 3 standard deviations):
# Keep only the points closer than 5 robust sigmas to the median
y = x[abs(x - p50) < rSig*5]
# Print the statistics again, now for the filtered sample
print("Mean= ", np.mean(y))
print("Median= ", np.median(y))
print("Max/Min=", y.max(), " ", y.min())
print("StdDev=", np.std(y))
Which gives:
Mean= 4.99755359935
Median= 4.95213030447
Max/Min= 11.1226494654 -2.15388472011
StdDev= 1.97692712883
I have no idea which approach is more efficient or more robust.
I wanted to do something similar, except setting the number to NaN rather than removing it from the data, since if you remove it you change the length which can mess up plotting (i.e. if you're only removing outliers from one column in a table, but you need it to remain the same as the other columns so you can plot them against each other).
To do so I used numpy's masking functions:
def reject_outliers(data, m=2):
    """Mask (rather than remove) the values further than m standard deviations from the mean.

    Returns a NumPy masked array of the same length as data, so it stays
    aligned with other columns. The accepted interval is printed.
    """
    spread = np.std(data)
    center = np.mean(data)
    lower, upper = center - spread * m, center + spread * m
    masked = np.ma.masked_outside(data, lower, upper)
    print('Masking values outside of {} and {}'.format(lower, upper))
    return masked
I would like to provide two methods in this answer, solution based on "z score" and solution based on "IQR".
The code provided in this answer works on both single dim numpy array and multiple numpy array.
Let's import some modules firstly.
import collections
import numpy as np
import scipy.stats as stat
from scipy.stats import iqr
z score based method
This method will test if the number falls outside the three standard deviations. Based on this rule, if the value is outlier, the method will return true, if not, return false.
def sd_outlier(x, axis = None, bar = 3, side = 'both'):
    """Flag values whose z-score exceeds bar.

    side selects the tail that is tested: 'gt' (z > bar), 'lt' (z < -bar)
    or 'both' (|z| > bar). Returns a boolean array, True for outliers.
    """
    assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'
    z_scores = stat.zscore(x, axis = axis)
    if side == 'both':
        return np.abs(z_scores) > bar
    if side == 'gt':
        return z_scores > bar
    if side == 'lt':
        return z_scores < -bar
IQR based method
This method will test if the value is less than q1 - 1.5 * iqr or greater than q3 + 1.5 * iqr, which is similar to SPSS's plot method.
def q1(x, axis = None):
    """Return the first quartile (25th percentile) of x along axis."""
    first_quartile = np.percentile(x, 25, axis = axis)
    return first_quartile
def q3(x, axis = None):
    """Return the third quartile (75th percentile) of x along axis."""
    third_quartile = np.percentile(x, 75, axis = axis)
    return third_quartile
def iqr_outlier(x, axis = None, bar = 1.5, side = 'both'):
    """Flag values outside the quartiles by more than bar times the interquartile range.

    Parameters
    ----------
    x : array_like
    axis : None, int or sequence of int
        Axis or axes along which the quartiles are computed; None uses the
        flattened array.
    bar : float
        Multiple of the interquartile range added above q3 / subtracted
        below q1.
    side : {'gt', 'lt', 'both'}
        'gt' tests only the upper fence, 'lt' only the lower fence.

    Returns
    -------
    numpy.ndarray
        Boolean array with the shape of x, True for outliers.
    """
    assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'
    x = np.asarray(x)
    if axis is not None and not np.isscalar(axis):
        axis = tuple(axis)

    # keepdims=True leaves every reduced axis with length 1, so the fences
    # broadcast against x for axis=None, a single axis or several axes. This
    # replaces the manual reshape that relied on collections.Iterable and
    # could not handle axis=None.
    d_q1 = np.percentile(x, 25, axis = axis, keepdims = True)
    d_q3 = np.percentile(x, 75, axis = axis, keepdims = True)
    iqr_distance = (d_q3 - d_q1) * bar

    if side == 'gt':
        return x > d_q3 + iqr_distance
    if side == 'lt':
        return x < d_q1 - iqr_distance
    if side == 'both':
        return np.logical_or(x > d_q3 + iqr_distance, x < d_q1 - iqr_distance)
Finally, if you want to filter out the outliers, use a numpy selector.
Have a nice day.
Consider that all the above methods fail when your standard deviation gets very large due to huge outliers.
(This is similar to how the average calculation fails, so that one should rather calculate the median. Though, the average is "more prone to such an error as the stdDv".)
You could try to iteratively apply your algorithm or you filter using the interquartile range:
(here "factor" relates to a n*sigma range, yet only when your data follows a Gaussian distribution)
import numpy as np
def sortoutOutliers(dataIn,factor):
    """Return the values of dataIn strictly within factor robust sigmas of the median.

    The sigma estimate is the interquartile range divided by 1.34896.
    The result is a list in the original order.
    """
    upper_quartile, lower_quartile = np.percentile(dataIn, [75 ,25])
    spread = upper_quartile - lower_quartile
    sigma_estimate = spread/1.34896
    center = np.median(dataIn)
    low = center - factor* sigma_estimate
    high = center + factor* sigma_estimate
    return [value for value in dataIn if low < value < high]
So many answers, but I'm adding a new one that can be useful for the author or even for other users.
You could use the Hampel filter. But you need to work with Series.
Hampel filter returns the Outliers indices, then you can delete them from the Series, and then convert it back to a List.
To use Hampel filter, you can easily install the package with pip:
pip install hampel
Usage:
# Imports
from hampel import hampel
import pandas as pd
# Example input: a short list with one obvious outlier (40)
list_d = [2, 4, 5, 1, 6, 5, 40]
# List to Series
time_series = pd.Series(list_d)
# Outlier detection with Hampel filter
# Returns the Outlier indices
# NOTE(review): assumes hampel() accepts the keyword `ts` and returns the
# index labels of the outliers; the package API may differ between
# releases — confirm against the installed version.
outlier_indices = hampel(ts = time_series, window_size = 3)
# Drop Outliers indices from Series
filtered_d = time_series.drop(outlier_indices)
# Bare expression: the value is only displayed in an interactive session
filtered_d.values.tolist()
print(f'filtered_d: {filtered_d.values.tolist()}')
And the output will be:
filtered_d: [2, 4, 5, 1, 6, 5]
Here, ts is a pandas Series object and window_size is the half-width of the window; the total window size is computed as 2 * window_size + 1.
For this Series I set window_size with the value 3.
The cool thing about working with Series is being able to generate graphics:
# Imports
import matplotlib.pyplot as plt
# The bundled seaborn styles were renamed with a 'seaborn-v0_8-' prefix in
# newer Matplotlib releases. Use the new name when it is available and fall
# back to the old one otherwise.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)
# Plot Original Series
time_series.plot(style = 'k-')
plt.title('Original Series')
plt.show()
# Plot Cleaned Series
filtered_d.plot(style = 'k-')
plt.title('Cleaned Series (Without detected Outliers)')
plt.show()
And the output will be:
To learn more about Hampel filter, I recommend the following readings:
Python implementation of the Hampel Filter
Outlier Detection with Hampel Filter
Clean-up your time series data with a Hampel Filter
If you want the index positions of the outliers, idx_list returns them.
def reject_outliers(data, m = 2.):
    """Split data into accepted values and the positions of the rejected ones.

    The distance of every value from the median is divided by the median of
    those distances; values with a scaled distance of at least m are
    outliers. If the median distance is zero, nothing is rejected.

    Parameters
    ----------
    data : array_like
    m : float
        Threshold on the scaled distance.

    Returns
    -------
    tuple of numpy.ndarray
        ``(accepted_values, idx_list)`` where idx_list holds the index
        positions of the outliers.
    """
    data = np.asarray(data)
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    # Keep s an array in the zero-spread case: a scalar 0. would turn the
    # comparisons below into single booleans and index with the wrong shape.
    s = d / mdev if mdev else np.zeros_like(d, dtype=float)
    data_range = np.arange(len(data))
    idx_list = data_range[s >= m]
    return data[s < m], idx_list
# Example: print the accepted values together with the outlier positions.
data_points = np.asarray([8, 10, 35, 17, 73, 77])
result = reject_outliers(data_points)
print(result)
after rejection: [ 8 10 35 17], index positions of outliers: [4 5]
For a set of images (each image has 3 dimensions), where I wanted to reject outliers for each pixel I used:
# Statistics across the image stack (axis 0), one value per pixel position.
mean = np.mean(imgs, axis=0)
std = np.std(imgs, axis=0)
# True where a value lies closer to the mean than half a standard deviation plus 1.
mask = np.abs(imgs - mean) < 0.5 * std + 1
# Rejected values are set to zero.
masked = np.multiply(imgs, mask)
Then it is possible to compute the mean:
# Mean over the accepted values only: sum of kept values / number of kept values.
masked_mean = np.sum(masked, axis=0) / np.sum(mask, axis=0)
(I use it for Background Subtraction)
Here I find the outliers in x and substitute them with the median of a window of points (win) around them (taking from Benjamin Bannier answer the median deviation)
def outlier_smoother(x, m=3, win=3, plots=False):
    ''' Replace outliers in x by the median of their neighbours.

    A point is an outlier when its distance from the median of x exceeds
    m times the median of all such distances. Each outlier is replaced by
    the median of up to win original points on either side (the point itself
    is excluded; the window is cut off at the ends of x).
    With plots=True the original, the outliers and the corrected series are drawn.
    Returns the corrected copy of x. '''
    smoothed = np.copy(x)
    deviation = np.abs(x - np.median(x))
    mdev = np.median(deviation)
    outlier_positions = np.nonzero(deviation > m*mdev)[0]
    n_points = len(x)
    for pos in outlier_positions:
        # Clamp the window to the valid index range at both ends.
        left = x[max(pos-win, 0):pos]
        right = x[pos+1:min(pos+win+1, n_points)]
        smoothed[pos] = np.median(np.append(left, right))
    if plots:
        plt.figure('outlier_smoother', clear=True)
        plt.plot(x, label='orig.', lw=5)
        plt.plot(outlier_positions, x[outlier_positions], 'ro', label='outliers')
        plt.plot(smoothed, '-o', label='corrected')
        plt.legend()
    return smoothed
Trim outliers in a numpy array along axis and replace them with min or max values along this axis, whichever is closer. The threshold is z-score:
def np_z_trim(x, threshold=10, axis=0):
    """Clip outliers along axis to the nearest extreme of the remaining values.

    A value is an outlier when its distance from the mean along axis is not
    below threshold standard deviations. Outliers are replaced by the
    smallest or the largest non-outlier value along that axis, whichever is
    closer.
    """
    center = np.mean(x, axis=axis, keepdims=True)
    spread = np.std(x, axis=axis, keepdims=True)
    inliers = np.where(np.abs(x - center) < threshold * spread, x, np.nan)
    lowest = np.nanmin(inliers, axis=axis, keepdims=True)
    highest = np.nanmax(inliers, axis=axis, keepdims=True)
    nearer_to_highest = np.abs(x - highest) < np.abs(x - lowest)
    replacement = np.where(nearer_to_highest, highest, lowest)
    return np.where(np.isnan(inliers), replacement, inliers)
My solution drops the top and bottom percentiles, keeping values that are equal to the boundary:
def remove_percentile_outliers(data, percent_to_drop=0.001):
    """Drop the lowest and highest percent_to_drop / 2 fraction of a pandas object.

    Values equal to either boundary quantile are kept.
    """
    half = percent_to_drop / 2
    low, high = data.quantile([half, 1-half])
    not_too_small = data >= low
    not_too_large = data <= high
    return data[not_too_small & not_too_large]
My solution sets each outlier equal to the previous value.
test_data = [2,4,5,1,6,5,40, 3]


def reject_outliers(data, m=2):
    """Overwrite outliers in data with the preceding value, in place.

    A value is an outlier when it is more than m standard deviations away
    from the mean; both statistics are computed once from the unmodified
    input. An outlier at position 0 has no predecessor and takes the first
    non-outlier value that follows it instead.

    Parameters
    ----------
    data : mutable sequence of numbers
        Modified in place.
    m : number
        Threshold in units of the standard deviation.

    Returns
    -------
    The same ``data`` object.
    """
    if len(data) == 0:
        return data
    mean = np.mean(data)
    limit = m * np.std(data)
    # Decide on the original values before anything is overwritten.
    is_outlier = [np.abs(value - mean) > limit for value in data]
    for i, flagged in enumerate(is_outlier):
        if not flagged:
            continue
        if i > 0:
            data[i] = data[i-1]
            continue
        # data[i-1] would wrap around to the LAST element for i == 0.
        for j, later_flagged in enumerate(is_outlier):
            if not later_flagged:
                data[0] = data[j]
                break
    return data


reject_outliers(test_data)
Output:
[2, 4, 5, 1, 6, 5, 5, 3]