How can I generate a random integer as with np.random.randint(), but with a normal distribution around 0?
np.random.randint(-10, 10) returns integers with a discrete uniform distribution
np.random.normal(0, 0.1, 1) returns floats with a normal distribution
What I want is a kind of combination between the two functions.
One other way to get a discrete distribution that looks like the normal distribution is to draw from a multinomial distribution where the probabilities are calculated from a normal distribution.
import scipy.stats as ss
import numpy as np
import matplotlib.pyplot as plt
x = np.arange(-10, 11)
xU, xL = x + 0.5, x - 0.5
prob = ss.norm.cdf(xU, scale = 3) - ss.norm.cdf(xL, scale = 3)
prob = prob / prob.sum() # normalize the probabilities so their sum is 1
nums = np.random.choice(x, size = 10000, p = prob)
plt.hist(nums, bins = len(x))
Here, np.random.choice picks an integer from [-10, 10]. The probability for selecting an element, say 0, is calculated by p(-0.5 < x < 0.5) where x is a normal random variable with mean zero and standard deviation 3. I chose a std. dev. of 3 because this way p(-10 < x < 10) is almost 1.
The result looks like this:
It may be possible to generate a similar distribution from a Truncated Normal Distribution that is rounded up to integers. Here's an example with scipy's truncnorm().
import numpy as np
from scipy.stats import truncnorm
import matplotlib.pyplot as plt
scale = 3.
limit = 10  # half-width of the integer range
size = 100000

X = truncnorm(a=-limit/scale, b=+limit/scale, scale=scale).rvs(size=size)
X = X.round().astype(int)
Let's see what it looks like
bins = 2 * limit + 1
plt.hist(X, bins)
The accepted answer here works, but I tried Will Vousden's solution and it works well too:
import numpy as np
import matplotlib.pyplot as plt

# Generate Distribution:
randomNums = np.random.normal(scale=3, size=100000)
randomInts = np.round(randomNums)

# Plot:
axis = np.arange(start=min(randomInts), stop=max(randomInts) + 1)
plt.hist(randomInts, bins=axis)
Old question, new answer:
For a bell-shaped distribution on the integers {-10, -9, ..., 9, 10}, you can use the binomial distribution with n=20 and p=0.5, and subtract 10 from the samples.
For example,
In [167]: import numpy as np
In [168]: import matplotlib.pyplot as plt
In [169]: rng = np.random.default_rng()
In [170]: N = 5000000 # Number of samples to generate
In [171]: samples = rng.binomial(n=20, p=0.5, size=N) - 10
In [172]: samples.min(), samples.max()
Out[172]: (-10, 10)
Note that the probability of -10 or 10 is pretty low, so you won't necessarily see them in any given sample, especially if you use a smaller N.
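For reference (my own quick check, not part of the original answer), the endpoint probability can be computed directly:
p_end = 0.5**20           # P(X = 0) = P(X = 20) for a Binomial(n=20, p=0.5)
print(p_end)              # 9.5367431640625e-07, roughly one in a million
print(p_end * 5_000_000)  # ~4.8 expected hits per endpoint with N = 5,000,000
That matches the counts of 4 at each end of the bincount output below.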
np.bincount() is an efficient way to generate a histogram for a collection of small nonnegative integers:
In [173]: counts = np.bincount(samples + 10, minlength=21)
In [174]: counts
Out[174]:
array([     4,    104,    889,   5517,  22861,  73805, 184473, 369441,
       599945, 800265, 881140, 801904, 600813, 370368, 185082,  73635,
        23325,   5399,    931,     95,      4])
In [175]: plt.bar(np.arange(-10, 11), counts)
Out[175]: <BarContainer object of 21 artists>
This version is not mathematically exact (because it crops the bell), but it does the job quickly and is easy to understand if high precision is not needed:
import numpy as np

def draw_random_normal_int(low: int, high: int):
    # generate a random normal number (float)
    normal = np.random.normal(loc=0, scale=1)
    # clip to [-3, 3] (where the bell with mean 0 and std 1 is very close to zero)
    normal = -3 if normal < -3 else normal
    normal = 3 if normal > 3 else normal
    # scale the range of 6 (-3..3) to the range low..high
    scaling_factor = (high - low) / 6
    normal_scaled = normal * scaling_factor
    # center around the mean of the range low..high
    normal_scaled += low + (high - low) / 2
    # then round and return
    return np.round(normal_scaled)
Drawing 100000 numbers results in this histogram:
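For completeness, a minimal sketch (mine, not part of the original answer) of how those 100000 draws and the histogram could be produced:
import matplotlib.pyplot as plt

draws = [draw_random_normal_int(-10, 10) for _ in range(100000)]
plt.hist(draws, bins=21)
plt.show()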
Here we start by getting values from the bell curve.
CODE:
#--------*---------*---------*---------*---------*---------*---------*---------*
# Desc: Discretize a normal distribution centered at 0
#--------*---------*---------*---------*---------*---------*---------*---------*
import random
from math import sqrt, pi
import numpy as np
import matplotlib.pyplot as plt

def gaussian(x, var):
    k1 = np.power(x, 2)
    k2 = -k1 / (2 * var)
    return (1. / (sqrt(2. * pi * var))) * np.exp(k2)

# probability density function for the discrete normal RV
pdf_DGV = []
pdf_DGW = []
var = 9
tot = 0

# create a 'rough' gaussian, folding the tails into the endpoints
for i in range(-var - 1, var + 2):
    if i == -var - 1:
        r_pdf = gaussian(i, 9) + gaussian(i - 1, 9) + gaussian(i - 2, 9)
    elif i == var + 1:
        r_pdf = gaussian(i, 9) + gaussian(i + 1, 9) + gaussian(i + 2, 9)
    else:
        r_pdf = gaussian(i, 9)
    tot = tot + r_pdf
    pdf_DGV.append(i)
    pdf_DGW.append(r_pdf)
    print(i, r_pdf)

# amusing how close tot is to 1!
print('\nRough total = ', tot)

# random.choices() (Python 3.6+) accepts unnormalized weights,
# but we normalize anyway
for i in range(0, len(pdf_DGW)):
    pdf_DGW[i] = pdf_DGW[i] / tot

# print out the pdf weights for our discrete gaussian
print('\npdf:\n')
print(pdf_DGW)

# draw from the discrete distribution and plot
rv_samples = random.choices(pdf_DGV, pdf_DGW, k=10000)
plt.hist(rv_samples, bins=100)
plt.show()
OUTPUT:
-10 0.0007187932912256041
-9 0.001477282803979336
-8 0.003798662007932481
-7 0.008740629697903166
-6 0.017996988837729353
-5 0.03315904626424957
-4 0.05467002489199788
-3 0.0806569081730478
-2 0.10648266850745075
-1 0.12579440923099774
0 0.1329807601338109
1 0.12579440923099774
2 0.10648266850745075
3 0.0806569081730478
4 0.05467002489199788
5 0.03315904626424957
6 0.017996988837729353
7 0.008740629697903166
8 0.003798662007932481
9 0.001477282803979336
10 0.0007187932912256041
Rough total = 0.9999715875468381
pdf:
[0.000718813714486599, 0.0014773247784004072, 0.003798769940305483, 0.008740878047691289, 0.017997500190860556, 0.033159988420867426, 0.05467157824565407, 0.08065919989878699, 0.10648569402724471, 0.12579798346031068, 0.13298453855078374, 0.12579798346031068, 0.10648569402724471, 0.08065919989878699, 0.05467157824565407, 0.033159988420867426, 0.017997500190860556, 0.008740878047691289, 0.003798769940305483, 0.0014773247784004072, 0.000718813714486599]
I'm not sure whether scipy's generators offer a choice of output dtype, but a common way to generate samples is with scipy.stats:
# Generate pseudodata from a single normal distribution
import scipy
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt

dist_mean = 0.0
dist_std = 0.5
n_events = 500

toy_data = scipy.stats.norm.rvs(dist_mean, dist_std, size=n_events)
toy_data2 = [[i, j] for i, j in enumerate(toy_data)]
arr = np.array(toy_data2)

print("sample:\n", arr[1:500, 0])
print("bin:\n", arr[1:500, 1])
plt.scatter(arr[1:500, 1], arr[1:500, 0])
plt.xlabel("bin")
plt.ylabel("sample")
plt.show()
Or like this (again, there is no option to choose the dtype):
import numpy as np
import matplotlib.pyplot as plt

mu, sigma = 0, 0.1  # mean and standard deviation
s = np.random.normal(mu, sigma, 500)

count, bins, ignored = plt.hist(s, 30, density=True)
plt.show()
print(bins)  # <<<<<<<<<<
plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) * np.exp(-(bins - mu)**2 / (2 * sigma**2)),
         linewidth=2, color='r')
plt.show()
Without visualization, the most common way (again with no way to specify the dtype):
bins = np.random.normal(3, 2.5, size=(10, 1))
A wrapper class could be written to produce samples with a given dtype (e.g. by rounding the floats to integers, as mentioned above)...
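For example, a minimal sketch of such a wrapper (the class name and its rvs() method are hypothetical, loosely mirroring the scipy.stats interface):
import numpy as np

class RoundedNormal:
    # hypothetical wrapper: draw from a normal distribution, then cast to the requested dtype
    def __init__(self, loc=0.0, scale=3.0, dtype=int):
        self.loc, self.scale, self.dtype = loc, scale, dtype

    def rvs(self, size=None):
        return np.round(np.random.normal(self.loc, self.scale, size)).astype(self.dtype)

gen = RoundedNormal(scale=3.0)
print(gen.rvs(size=10))  # e.g. [ 1 -3  0  2 -1  4  0 -2  1  3]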
I have the following boundary conditions for a time series in Python.
The notation I use here is t_x, where x describes the time in milliseconds (this is not my code, I just thought this notation is good for explaining my issue).
t_0 = 0
t_440 = -1.6
t_830 = 0
mean_value = -0.6
I want to create a list that contains 83 values (so the spacing is 10ms for each value).
The list should describe a "curve" that starts at zero, has its minimum value of -1.6 at 440ms (so 44 in the list), ends with 0 at 830ms (so 83 in the list), and the overall mean value of the list should be -0.6.
I absolutely could not come up with an idea of how to "fit" the boundaries to create such a list.
I would really appreciate help.
It is a quick and dirty approach, but it works:
X = list(range(0, 830 + 1, 10))
Y = [0.0 for x in X]
Y[44] = -1.6
b = 12.3486
for x in range(44):
    Y[x] = -1.6*(b*x + x**2)/(b*44 + 44**2)
for x in range(83, 44, -1):
    Y[x] = -1.6*(b*(83-x) + (83-x)**2)/(b*38 + 38**2)
print(f'{sum(Y)/len(Y)=:8.6f}, {Y[0]=}, {Y[44]=}, {Y[83]=}')

from matplotlib import pyplot as plt
plt.plot(X, Y)
plt.show()
The code gives the following output:
sum(Y)/len(Y)=-0.600000, Y[0]=-0.0, Y[44]=-1.6, Y[83]=-0.0
And it shows the following diagram:
The first step in coming up with the above approach was to create a linearly sloping 'curve' from the minimum up to the zeroes. It turned out that the linear approach gives too large a mean Y value here, which means the 'curve' must have a sharp peak at its minimum and needs to be approximated with a polynomial. To keep things simple I decided to use a quadratic polynomial and to approach the minimum from the left and the right side separately, since the curve isn't symmetric. The b value was found by trial and error; its precision can be increased manually or by writing a small function that finds it iteratively.
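As a sketch of that iterative search (my own addition, not part of the original answer): the mean of Y decreases monotonically as b grows for these quadratics, so a simple bisection on b works:
def mean_for_b(b):
    # same formulas as above, for 84 points (indices 0..83)
    Y = [0.0] * 84
    Y[44] = -1.6
    for x in range(44):
        Y[x] = -1.6 * (b * x + x**2) / (b * 44 + 44**2)
    for x in range(83, 44, -1):
        Y[x] = -1.6 * (b * (83 - x) + (83 - x)**2) / (b * 38 + 38**2)
    return sum(Y) / len(Y)

lo, hi = 0.0, 100.0              # bracket for b
for _ in range(60):              # bisection
    mid = (lo + hi) / 2
    if mean_for_b(mid) < -0.6:   # mean overshoots (too negative): b is too large
        hi = mid
    else:
        lo = mid
print((lo + hi) / 2)             # converges to about 12.3486, the value used above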
Update providing a generic solution as requested in a comment
The code below provides a
meanYboundaryXY(lbc = [(0,0), (440,-1.6), (830,0), -0.6], shape='saw')
function that returns the X and Y lists of the time series data, calculated from the passed boundary values:
def meanYboundaryXY(lbc=[(0, 0), (440, -1.6), (830, 0), -0.6], shape='saw'):
    lbcXY = lbc[0:3]
    meanY_boundary = lbc[3]
    minX = min(x for x, y in lbcXY)
    maxX = max(x for x, y in lbcXY)
    minY = lbc[1][1]
    step = 10
    X = list(range(minX, maxX + 1, step))
    lenX = len(X)
    Y = [None for x in X]
    sumY = 0
    for x, y in lbcXY:
        Y[x//step] = y
        sumY += y
    target_sumY = meanY_boundary * lenX
    if shape == 'rect':
        subY = (target_sumY - sumY) / (lenX - 3)
        for i, y in enumerate(Y):
            if y is None:
                Y[i] = subY
    elif shape == 'saw':
        peakNextY = 2 * (target_sumY - sumY) / (lenX - 1)
        iYleft = lbc[1][0]//step - 1
        iYrght = iYleft + 2
        iYstart = lbc[0][0] // step
        iYend = lbc[2][0] // step
        for i in range(iYstart, iYleft + 1, 1):
            Y[i] = peakNextY * i / iYleft
        for i in range(iYend, iYrght - 1, -1):
            Y[i] = peakNextY * (iYend - i) / (iYend - iYrght)
    else:
        raise ValueError(f'meanYboundaryXY() EXIT, {shape=} not in ["saw","rect"]')
    return (X, Y)
X, Y = meanYboundaryXY()
print(f'{sum(Y)/len(Y)=:8.6f}, {Y[0]=}, {Y[44]=}, {Y[83]=}')
from matplotlib import pyplot as plt
plt.plot(X,Y)
plt.show()
The code outputs:
sum(Y)/len(Y)=-0.600000, Y[0]=0, Y[44]=-1.6, Y[83]=0
and creates the following two diagrams for shape='rect' and shape='saw':
As an old geek, I tried to solve the question with a simple algorithm:
First, calculate the points as two symmetric lines, from 0 up to 44 and from 44 down to 88 (orange on the graph).
Then calculate the ratio between the sum the points must have for an overall mean of -0.6 and their current sum, both excluding the middle point.
Apply that ratio to all points except the middle point (blue curve on the graph).
The result is the curve Claudio called "saw".
Personally, I think Claudio's quadratic interpolation gives a nicer curve, but it needs trial-and-error loops.
# define goals
nbPoints = 89
msPerPoint = 10
midPoint = nbPoints // 2
valueMidPoint = -1.6
meanGoal = -0.6

def createSerieLinear():
    # two lines: 0 up to 44, 44 down to 88 (89 values centered on 44)
    serie = [0 for i in range(0, nbPoints)]
    interval = valueMidPoint / midPoint
    for i in range(0, midPoint + 1):
        serie[i] = i * interval
        serie[nbPoints - 1 - i] = i * interval
    return serie

# keep an original to plot
orange = createSerieLinear()
# work on a base
base = createSerieLinear()

# total except midPoint
totalBase = sum(base) - valueMidPoint
# total goal except 44
totalGoal = meanGoal * nbPoints - valueMidPoint
# apply ratio to reduce
reduceRatio = totalGoal / totalBase
for i in range(0, midPoint):
    base[i] *= reduceRatio
    base[nbPoints - 1 - i] *= reduceRatio

# verify
meanBase = sum(base) / nbPoints
print("new mean:", meanBase)

# draw
from matplotlib import pyplot as plt
X = [i * msPerPoint for i in range(0, nbPoints)]
plt.plot(X, base)
plt.plot(X, orange)
plt.show()
new mean: -0.5999999999999998
Hope you enjoy simple things :)
Is there a numpy builtin to do something like the following? That is, take a list d and return a list filtered_d with any outlying elements removed based on some assumed distribution of the points in d.
import numpy as np
def reject_outliers(data):
    m = 2
    u = np.mean(data)
    s = np.std(data)
    filtered = [e for e in data if (u - m * s < e < u + m * s)]
    return filtered
>>> d = [2,4,5,1,6,5,40]
>>> filtered_d = reject_outliers(d)
>>> print(filtered_d)
[2, 4, 5, 1, 6, 5]
I say 'something like' because the function might allow for varying distributions (poisson, gaussian, etc.) and varying outlier thresholds within those distributions (like the m I've used here).
Something important when dealing with outliers is that one should try to use estimators that are as robust as possible. The mean of a distribution will be biased by outliers, but the median, for example, will be much less so.
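For example, with the data from the question:
import numpy as np

d = np.array([2, 4, 5, 1, 6, 5, 40])
print(np.mean(d))    # 9.0 -- dragged upwards by the single outlier
print(np.median(d))  # 5.0 -- barely affected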
Building on eumiro's answer:
def reject_outliers(data, m=2.):
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    s = d / mdev if mdev else np.zeros(len(d))
    return data[s < m]
Here I have replaced the mean with the more robust median and the standard deviation with the median absolute distance to the median. I then scaled the distances by their (again) median value so that m is on a reasonable relative scale.
Note that for the data[s<m] syntax to work, data must be a numpy array.
This method is almost identical to yours, just more numpythonic (it also works only on numpy arrays):
def reject_outliers(data, m=2):
    return data[abs(data - np.mean(data)) < m * np.std(data)]
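Applied to the question's data this reproduces the expected result:
d = np.array([2, 4, 5, 1, 6, 5, 40])
print(reject_outliers(d))  # [2 4 5 1 6 5]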
Benjamin Bannier's answer yields a pass-through when the median of distances from the median is 0, so I found this modified version a bit more helpful for cases as given in the example below.
def reject_outliers_2(data, m=2.):
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    s = d / (mdev if mdev else 1.)
    return data[s < m]
Example:
data_points = np.array([10, 10, 10, 17, 10, 10])
print(reject_outliers(data_points))
print(reject_outliers_2(data_points))
Gives:
[10 10 10 17 10 10]  # 17 is not filtered
[10 10 10 10 10]     # 17 is filtered (its distance, 7, is greater than m)
Building on Benjamin's, using pandas.Series, and replacing MAD with IQR:
def reject_outliers(sr, iq_range=0.5):
    pcnt = (1 - iq_range) / 2
    qlow, median, qhigh = sr.dropna().quantile([pcnt, 0.50, 1 - pcnt])
    iqr = qhigh - qlow
    return sr[(sr - median).abs() <= iqr]
For instance, if you set iq_range=0.6, the percentiles of the interquartile-range would become: 0.20 <--> 0.80, so more outliers will be included.
An alternative is to make a robust estimation of the standard deviation (assuming Gaussian statistics). Looking up online calculators, I see that the 90th percentile corresponds to 1.2815σ and the 95th to 1.645σ (http://vassarstats.net/tabs.html?#z).
As a simple example:
import numpy as np
# Create some random numbers
x = np.random.normal(5, 2, 1000)
# Calculate the statistics
print("Mean= ", np.mean(x))
print("Median= ", np.median(x))
print("Max/Min=", x.max(), " ", x.min())
print("StdDev=", np.std(x))
print("90th Percentile", np.percentile(x, 90))
# Add a few large points
x[10] += 1000
x[20] += 2000
x[30] += 1500
# Recalculate the statistics
print()
print("Mean= ", np.mean(x))
print("Median= ", np.median(x))
print("Max/Min=", x.max(), " ", x.min())
print("StdDev=", np.std(x))
print("90th Percentile", np.percentile(x, 90))
# Measure the percentile intervals and then estimate Standard Deviation of the distribution, both from median to the 90th percentile and from the 10th to 90th percentile
p90 = np.percentile(x, 90)
p10 = np.percentile(x, 10)
p50 = np.median(x)
# p50 to p90 is 1.2815 sigma
rSig = (p90-p50)/1.2815
print("Robust Sigma=", rSig)
rSig = (p90-p10)/(2*1.2815)
print("Robust Sigma=", rSig)
The output I get is:
Mean= 4.99760520022
Median= 4.95395274981
Max/Min= 11.1226494654 -2.15388472011
StdDev= 1.976629928
90th Percentile 7.52065379649
Mean= 9.64760520022
Median= 4.95667658782
Max/Min= 2205.43861943 -2.15388472011
StdDev= 88.6263902244
90th Percentile 7.60646688694
Robust Sigma= 2.06772555531
Robust Sigma= 1.99878292462
Which is close to the expected value of 2.
If we want to remove points above/below 5 standard deviations (with 1000 points we would expect 1 value > 3 standard deviations):
y = x[abs(x - p50) < rSig*5]
# Print the statistics again
print("Mean= ", np.mean(y))
print("Median= ", np.median(y))
print("Max/Min=", y.max(), " ", y.min())
print("StdDev=", np.std(y))
Which gives:
Mean= 4.99755359935
Median= 4.95213030447
Max/Min= 11.1226494654 -2.15388472011
StdDev= 1.97692712883
I have no idea which approach is the more efficient/robust.
I wanted to do something similar, except setting the number to NaN rather than removing it from the data, since if you remove it you change the length, which can mess up plotting (i.e. if you're only removing outliers from one column in a table, you need it to remain the same length as the other columns so you can plot them against each other).
To do so I used numpy's masking functions:
def reject_outliers(data, m=2):
    stdev = np.std(data)
    mean = np.mean(data)
    maskMin = mean - stdev * m
    maskMax = mean + stdev * m
    mask = np.ma.masked_outside(data, maskMin, maskMax)
    print('Masking values outside of {} and {}'.format(maskMin, maskMax))
    return mask
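A quick usage sketch (my own; the array should be float so the masked entry can be filled with NaN afterwards):
d = np.array([2., 4., 5., 1., 6., 5., 40.])
masked = reject_outliers(d)    # prints the masking bounds
print(masked.filled(np.nan))   # [ 2.  4.  5.  1.  6.  5. nan]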
I would like to provide two methods in this answer: a solution based on the "z score" and a solution based on the "IQR".
The code provided in this answer works on both single-dimensional and multidimensional numpy arrays.
Let's import some modules first.
import collections.abc
import numpy as np
import scipy.stats as stat
from scipy.stats import iqr
z score based method
This method tests whether a value falls outside three standard deviations. Based on this rule, the method returns True if the value is an outlier and False otherwise.
def sd_outlier(x, axis=None, bar=3, side='both'):
    assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'
    d_z = stat.zscore(x, axis=axis)
    if side == 'gt':
        return d_z > bar
    elif side == 'lt':
        return d_z < -bar
    elif side == 'both':
        return np.abs(d_z) > bar
IQR based method
This method will test if the value is less than q1 - 1.5 * iqr or greater than q3 + 1.5 * iqr, which is similar to SPSS's plot method.
def q1(x, axis=None):
    return np.percentile(x, 25, axis=axis)

def q3(x, axis=None):
    return np.percentile(x, 75, axis=axis)

def iqr_outlier(x, axis=None, bar=1.5, side='both'):
    assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'
    d_iqr = iqr(x, axis=axis)
    d_q1 = q1(x, axis=axis)
    d_q3 = q3(x, axis=axis)
    iqr_distance = np.multiply(d_iqr, bar)
    stat_shape = list(x.shape)
    if isinstance(axis, collections.abc.Iterable):
        for single_axis in axis:
            stat_shape[single_axis] = 1
    else:
        stat_shape[axis] = 1
    if side in ['gt', 'both']:
        upper_range = d_q3 + iqr_distance
        upper_outlier = np.greater(x - upper_range.reshape(stat_shape), 0)
    if side in ['lt', 'both']:
        lower_range = d_q1 - iqr_distance
        lower_outlier = np.less(x - lower_range.reshape(stat_shape), 0)
    if side == 'gt':
        return upper_outlier
    if side == 'lt':
        return lower_outlier
    if side == 'both':
        return np.logical_or(upper_outlier, lower_outlier)
Finally, if you want to filter out the outliers, use a numpy selector.
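For instance (my own sketch using the question's data; note that a 1-D array needs an explicit axis here):
x = np.array([2., 4., 5., 1., 6., 5., 40.])
mask = iqr_outlier(x, axis=0)
print(x[~mask])  # [2. 4. 5. 1. 6. 5.]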
Have a nice day.
Consider that all the above methods fail when your standard deviation gets very large due to huge outliers. (This is similar to how the average calculation fails and one should rather calculate the median. Though, the average is "more prone to such an error than the stdDev".)
You could try to apply your algorithm iteratively, or you can filter using the interquartile range (here "factor" relates to an n*sigma range, but only when your data follows a Gaussian distribution):
import numpy as np

def sortoutOutliers(dataIn, factor):
    quant3, quant1 = np.percentile(dataIn, [75, 25])
    iqr = quant3 - quant1
    iqrSigma = iqr / 1.34896
    medData = np.median(dataIn)
    dataOut = [x for x in dataIn if ((x > medData - factor * iqrSigma) and (x < medData + factor * iqrSigma))]
    return dataOut
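Applied to the question's data (factor=3 here is an arbitrary choice of mine):
d = [2, 4, 5, 1, 6, 5, 40]
print(sortoutOutliers(d, factor=3))  # [2, 4, 5, 1, 6, 5]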
So many answers, but I'm adding a new one that can be useful for the author or even for other users.
You could use the Hampel filter. But you need to work with Series.
The Hampel filter returns the outliers' indices; you can then delete them from the Series and finally convert it back to a list.
To use Hampel filter, you can easily install the package with pip:
pip install hampel
Usage:
# Imports
from hampel import hampel
import pandas as pd
list_d = [2, 4, 5, 1, 6, 5, 40]
# List to Series
time_series = pd.Series(list_d)
# Outlier detection with Hampel filter
# Returns the Outlier indices
outlier_indices = hampel(ts = time_series, window_size = 3)
# Drop the outlier indices from the Series
filtered_d = time_series.drop(outlier_indices)

print(f'filtered_d: {filtered_d.values.tolist()}')
And the output will be:
filtered_d: [2, 4, 5, 1, 6, 5]
Here, ts is a pandas Series object and window_size is the window parameter; the total window size is computed as 2 * window_size + 1.
For this Series I set window_size to 3.
The cool thing about working with Series is being able to generate graphics:
# Imports
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
# Plot Original Series
time_series.plot(style = 'k-')
plt.title('Original Series')
plt.show()
# Plot Cleaned Series
filtered_d.plot(style = 'k-')
plt.title('Cleaned Series (Without detected Outliers)')
plt.show()
And the output will be:
To learn more about Hampel filter, I recommend the following readings:
Python implementation of the Hampel Filter
Outlier Detection with Hampel Filter
Clean-up your time series data with a Hampel Filter
If you want to get the index positions of the outliers, idx_list will return them.
def reject_outliers(data, m=2.):
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    s = d / mdev if mdev else np.zeros(len(d))
    data_range = np.arange(len(data))
    idx_list = data_range[s >= m]
    return data[s < m], idx_list
data_points = np.array([8, 10, 35, 17, 73, 77])
print(reject_outliers(data_points))
after rejection: [ 8 10 35 17], index positions of outliers: [4 5]
For a set of images (each image has 3 dimensions), where I wanted to reject outliers for each pixel I used:
mean = np.mean(imgs, axis=0)
std = np.std(imgs, axis=0)
mask = np.greater(0.5 * std + 1, np.abs(imgs - mean))
masked = np.multiply(imgs, mask)
Then it is possible to compute the mean:
masked_mean = np.divide(np.sum(masked, axis=0), np.sum(mask, axis=0))
(I use it for Background Subtraction)
Here I find the outliers in x and substitute them with the median of a window of points (win) around them (taking the median deviation from Benjamin Bannier's answer).
import numpy as np
import matplotlib.pyplot as plt

def outlier_smoother(x, m=3, win=3, plots=False):
    ''' finds outliers in x, points > m*mdev(x) [mdev: median deviation]
    and replaces them with the median of win points around them '''
    x_corr = np.copy(x)
    d = np.abs(x - np.median(x))
    mdev = np.median(d)
    idxs_outliers = np.nonzero(d > m * mdev)[0]
    for i in idxs_outliers:
        if i - win < 0:
            x_corr[i] = np.median(np.append(x[0:i], x[i+1:i+win+1]))
        elif i + win + 1 > len(x):
            x_corr[i] = np.median(np.append(x[i-win:i], x[i+1:len(x)]))
        else:
            x_corr[i] = np.median(np.append(x[i-win:i], x[i+1:i+win+1]))
    if plots:
        plt.figure('outlier_smoother', clear=True)
        plt.plot(x, label='orig.', lw=5)
        plt.plot(idxs_outliers, x[idxs_outliers], 'ro', label='outliers')
        plt.plot(x_corr, '-o', label='corrected')
        plt.legend()
    return x_corr
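For example (a quick check of mine, not from the original answer):
x = np.array([2., 4., 5., 1., 6., 5., 40., 3.])
print(outlier_smoother(x))  # [2. 4. 5. 1. 6. 5. 4. 3.] -- 40 replaced by the median of its neighbours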
Trim outliers in a numpy array along an axis and replace them with the min or max values along that axis, whichever is closer. The threshold is a z-score:
def np_z_trim(x, threshold=10, axis=0):
    """ Replace outliers in a numpy ndarray along axis with the min or max values
    within the threshold along this axis, whichever is closer."""
    mean = np.mean(x, axis=axis, keepdims=True)
    std = np.std(x, axis=axis, keepdims=True)
    masked = np.where(np.abs(x - mean) < threshold * std, x, np.nan)
    min = np.nanmin(masked, axis=axis, keepdims=True)
    max = np.nanmax(masked, axis=axis, keepdims=True)
    repl = np.where(np.abs(x - max) < np.abs(x - min), max, min)
    return np.where(np.isnan(masked), repl, masked)
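A quick check (mine, not from the answer), with a deliberately tight threshold so the outlier actually gets replaced:
x = np.array([1., 2., 3., 2., 100.])
print(np_z_trim(x, threshold=1))  # [1. 2. 3. 2. 3.] -- 100 is replaced by the nearest extreme inlier, 3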
My solution drops the top and bottom percentiles, keeping values that are equal to the boundary:
def remove_percentile_outliers(data, percent_to_drop=0.001):
    low, high = data.quantile([percent_to_drop / 2, 1 - percent_to_drop / 2])
    return data[(data >= low) & (data <= high)]
My solution sets each outlier equal to the previous value.
test_data = [2, 4, 5, 1, 6, 5, 40, 3]

def reject_outliers(data, m=2):
    mean = np.mean(data)
    std = np.std(data)
    for i in range(len(data)):
        if np.abs(data[i] - mean) > m * std:
            data[i] = data[i - 1]
    return data

reject_outliers(test_data)
Output:
[2, 4, 5, 1, 6, 5, 5, 3]