I would like to calculate the EWMA Covariance Matrix from a DataFrame of stock price returns using Pandas and have followed the methodology in PyPortfolioOpt.
I like the flexibility of using Pandas objects and functions, but when the set of assets grows the function becomes very slow:
import pandas as pd
import numpy as np
def ewma_cov_pairwise_pd(x, y, alpha=0.06):
    x = x.mask(y.isnull(), np.nan)
    y = y.mask(x.isnull(), np.nan)
    covariation = ((x - x.mean()) * (y - y.mean())).dropna()
    return covariation.ewm(alpha=alpha).mean().iloc[-1]
def ewma_cov_pd(rets, alpha=0.06):
    assets = rets.columns
    n = len(assets)
    cov = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            cov[i, j] = cov[j, i] = ewma_cov_pairwise_pd(
                rets.iloc[:, i], rets.iloc[:, j], alpha=alpha)
    return pd.DataFrame(cov, columns=assets, index=assets)
I would like to improve the speed of the code, ideally while still using Pandas, but the bottleneck is the DataFrame.ewm() function, which uses 90% of the calculation time.
If using this function is a binding constraint, what is the most efficient way of improving the speed at which the code runs? I was considering taking a brute-force approach and using concurrent.futures.ProcessPoolExecutor, but perhaps there is a better solution.
n = 100 # n is typically 2000
rets = pd.DataFrame(np.random.normal(0, 1., size=(n, n)))
cov_pd = ewma_cov_pd(rets)
The true time-series data can contain leading nulls and potentially missing values after that, although the latter is less likely.
Update I
A potential solution, which leverages the answer provided by Quang Hoang and produces the expected results in a far more reasonable time, would be something similar to:
def ewma_cov_frame_qh(rets, alpha=0.06):
    weights = (1-alpha) ** np.arange(len(rets))[::-1]
    normalized = (rets-rets.mean()).to_numpy()
    out = (weights * normalized.T) @ normalized / weights.sum()
    return pd.DataFrame(out, index=rets.columns, columns=rets.columns)
def ewma_cov_qh(rets, alpha=0.06):
    syms = rets.columns
    covar = pd.DataFrame(index=rets.columns, columns=rets.columns)
    delta = rets.isnull().sum(axis=1).shift(1) - rets.isnull().sum(axis=1)
    dates = delta.loc[delta != 0].index.tolist()
    for date in dates:
        frame = rets.loc[rets.index >= date].dropna(axis=1, how='any')
        cov = ewma_cov_frame_qh(frame, alpha=alpha).reindex(index=syms, columns=syms)
        covar = covar.fillna(cov)
    return covar
cov_qh = ewma_cov_qh(rets)
This violates the requirement that the underlying covariance is calculated using the native Pandas/Numpy functions, and the calculation time will depend on the number of leading NaNs in the data set.
Update II
A potential improvement on the above, which uses (a naive implementation of) multiprocessing and improves the calculation time by a further 42.5% on my machine, is listed below:
from concurrent.futures import ProcessPoolExecutor, as_completed
from functools import partial
def ewma_cov_mp_worker(date, rets, alpha=0.06):
    syms = rets.columns
    frame = rets.loc[rets.index >= date].dropna(axis=1, how='any')
    return ewma_cov_frame_qh(frame, alpha=alpha).reindex(index=syms, columns=syms)

def ewma_cov_mp(rets, alpha=0.06):
    covar = pd.DataFrame(index=rets.columns, columns=rets.columns)
    delta = rets.isnull().sum(axis=1).shift(1) - rets.isnull().sum(axis=1)
    dates = delta.loc[delta != 0].index.tolist()
    func = partial(ewma_cov_mp_worker, rets=rets, alpha=alpha)
    covs = {}
    with ProcessPoolExecutor(max_workers=6) as executor:
        future_to_date = {executor.submit(func, date): date for date in dates}
        covs = {future_to_date[future]: future.result() for future in as_completed(future_to_date)}
    for date in dates:
        covar.fillna(covs[date], inplace=True)
    return covar
[I have not added this as an answer since it does not address the original question, and I am optimistic there is a better solution.]
Since you don't really care for ewm, i.e. you only take the last value, we can try matrix multiplication:
def ewma(df, alpha=0.94):
    weights = (1-alpha) ** np.arange(len(df))[::-1]
    # fillna with 0 here
    normalized = (df-df.mean()).fillna(0).to_numpy()
    out = (weights * normalized.T) @ normalized / weights.sum()
    return out
# verify
out = ewma(df)
print(out[0,1] == ewma_cov_pairwise(df[0],df[1]) )
# True
And this took about 150 ms on my system with df.shape==(2000,2000) while your code refuses to run within minutes :-).
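For reference, the identity being exploited here (my summary): with the default adjust=True, ewm(alpha=alpha).mean().iloc[-1] is just a weighted average with weights (1 - alpha)**k at lag k, so the last EWM value of the demeaned cross-products equals a single weighted matrix product over the demeaned return matrix. A small check of that claim on one pair of columns (my own snippet, with made-up data):

import numpy as np
import pandas as pd

rets = pd.DataFrame(np.random.normal(0., 1., size=(500, 3)))
alpha = 0.06
w = (1 - alpha) ** np.arange(len(rets))[::-1]
x = rets[0] - rets[0].mean()
y = rets[1] - rets[1].mean()
# plain weighted average of the cross-products ...
manual = (w * x * y).sum() / w.sum()
# ... equals the last value of the EWM mean of the cross-products
print(np.isclose(manual, (x * y).ewm(alpha=alpha).mean().iloc[-1]))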
Edited to include VBA code for comparison
Also, we know the analytical value, which is 8.021, towards which the Monte-Carlo should converge, which makes the comparison easier.
Excel VBA gives 8.067 based on averaging 5 Monte-Carlo simulations (7.989, 8.187, 8.045, 8.034, 8.075)
Python gives 7.973 based on 5 MCs (7.913, 7.915, 8.203, 7.739, 8.095) and a larger Variance!
The VBA code is not even "that good", using a rather bad way to produce samples from Standard Normal!
I am running a super simple code in Python to price European Call Option via Monte Carlo, and I am surprised at how "bad" the convergence is with 10,000 "simulated paths". Usually, when running a Monte-Carlo for this simple problem in C++ or even VBA, I get better convergence.
I show the code below (the code is taken from the textbook "Python for Finance" and I run it in Visual Studio Code under Python 3.7.7, 64-bit version). I get the following results, as an example: Run 1 = 7.913, Run 2 = 7.915, Run 3 = 8.203, Run 4 = 7.739, Run 5 = 8.095.
Results such as the above, which differ by so much, would be unacceptable. How can the convergence be improved? (Obviously by running more paths, but as I said: for 10,000 paths, the result should already have converged much better):
#MonteCarlo valuation of European Call Option
import math
import numpy as np
#Parameter Values
S_0 = 100. # initial value
K = 105. # strike
T = 1.0 # time to maturity
r = 0.05 # short rate (constant)
sigma = 0.2 # vol
nr_simulations = 10000
#Valuation Algo:
# Notice the vectorization below, instead of a loop
z = np.random.standard_normal(nr_simulations)
# Notice that the S_T below is a VECTOR!
S_T = S_0 * np.exp((r-0.5*sigma**2)+math.sqrt(T)*sigma*z)
#Call option pay-off at maturity (Vector!)
C_T = np.maximum((S_T-K),0)
# C_0 is a scalar
C_0 = math.exp(-r*T)*np.average(C_T)
print('Value of the European Call is: ', C_0)
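For reference, the analytical value of roughly 8.021 quoted above can be reproduced with the closed-form Black-Scholes price (a minimal sketch using scipy.stats.norm and the parameters defined above):

from scipy.stats import norm

d1 = (math.log(S_0 / K) + (r + 0.5 * sigma**2) * T) / (sigma * math.sqrt(T))
d2 = d1 - sigma * math.sqrt(T)
bs_price = S_0 * norm.cdf(d1) - K * math.exp(-r * T) * norm.cdf(d2)
print('Black-Scholes value: ', bs_price)  # approximately 8.02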
I also include VBA code, which produces slightly better results (in my opinion): with the VBA code below, I get 7.989, 8.187, 8.045, 8.034, 8.075.
Option Explicit
Sub monteCarlo()
    ' variable declaration
    ' stock initial & final values, option pay-off at maturity
    Dim stockInitial, stockFinal, optionFinal As Double
    ' r = rate, sigma = volatility, strike = strike price
    Dim r, sigma, strike As Double
    ' maturity of the option
    Dim maturity As Double
    ' instantiate variables
    stockInitial = 100#
    r = 0.05
    maturity = 1#
    sigma = 0.2
    strike = 105#
    ' normal is Standard Normal
    Dim normal As Double
    ' randomNr is randomly generated nr via "rnd()" function, between 0 & 1
    Dim randomNr As Double
    ' variable for storing the final result value
    Dim result As Double
    Dim i, j As Long, monteCarlo As Long
    monteCarlo = 10000
    For j = 1 To 5
        result = 0#
        For i = 1 To monteCarlo
            ' get random nr between 0 and 1
            randomNr = Rnd()
            'max(Rnd(), 0.000000001)
            ' standard Normal
            normal = Application.WorksheetFunction.Norm_S_Inv(randomNr)
            stockFinal = stockInitial * Exp((r - (0.5 * (sigma ^ 2))) + (sigma * Sqr(maturity) * normal))
            optionFinal = max((stockFinal - strike), 0)
            result = result + optionFinal
        Next i
        result = result / monteCarlo
        result = result * Exp(-r * maturity)
        Worksheets("sheet1").Cells(j, 1) = result
    Next j
    MsgBox "Done"
End Sub

Function max(ByVal number1 As Double, ByVal number2 As Double)
    If number1 > number2 Then
        max = number1
    Else
        max = number2
    End If
End Function
I don't think there is anything wrong with Python or numpy internals; the convergence should be the same no matter what tool you're using. I ran a few simulations with different sample sizes and different sigma values. No surprise, it turns out the speed of convergence is heavily controlled by the sigma value, see the plot below. Note that the x axis is on a log scale! After the bigger oscillations fade away there are more smaller waves before it stabilizes. This is easiest to see at sigma=0.5.
I'm definitely not an expert, but I think the most obvious solution is to increase sample size, as you mentioned. It would be nice to see results and code from C++ or VBA, because I don't know how familiar you are with numpy and python functions. Maybe something is not doing what you think it's doing.
Code to generate the plot (let's not talk about efficiency, it's horrible):
import numpy as np
import matplotlib.pyplot as plt
S_0 = 100. # initial value
K = 105. # strike
T = 1.0 # time to maturity
r = 0.05 # short rate (constant)
fig = plt.figure()
ax = fig.add_subplot()
plt.xscale('log')
samplesize = np.geomspace(1000, 20000000, 64)
sigmas = np.arange(0, 0.7, 0.1)
for s in sigmas:
    arr = []
    for n in samplesize:
        n = n.astype(int)
        z = np.random.standard_normal(n)
        S_T = S_0 * np.exp((r-0.5*s**2)+np.sqrt(T)*s*z)
        C_T = np.maximum((S_T-K),0)
        C_0 = np.exp(-r*T)*np.average(C_T)
        arr.append(C_0)
    ax.scatter(samplesize, arr, label=f'sigma={s:.2f}')
plt.tight_layout()
plt.xlabel('Sample size')
plt.ylabel('Value')
plt.grid()
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1], loc='upper left')
plt.show()
Addition:
This time you got closer results to the real value using VBA. But sometimes you don't. The effect of randomness is too big here. The truth is that averaging only 5 results from a low-sample simulation is not meaningful. For example, averaging 50 different simulations in Python (with only n=10000, even though you shouldn't do that if you want the right answer) yields 8.025167 (± 0.039717 with a 95% confidence level), which is very close to the real solution.
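A minimal sketch of how such a confidence interval can be estimated by repeating the simulation (my own illustration; the exact numbers vary from run to run):

import numpy as np

S_0, K, T, r, sigma = 100., 105., 1.0, 0.05, 0.2
n_sims, n_repeats = 10000, 50

estimates = []
for _ in range(n_repeats):
    z = np.random.standard_normal(n_sims)
    S_T = S_0 * np.exp((r - 0.5 * sigma**2) * T + sigma * np.sqrt(T) * z)
    estimates.append(np.exp(-r * T) * np.mean(np.maximum(S_T - K, 0)))

mean = np.mean(estimates)
# 1.96 is the normal 95% quantile; the half-width comes from the
# standard error of the repeated estimates
half_width = 1.96 * np.std(estimates, ddof=1) / np.sqrt(n_repeats)
print(f'{mean:.6f} +/- {half_width:.6f}')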
How can I generate random walk data between a start and an end value
while not passing over the maximum value and not going under the minimum value?
Here is my attempt to do this, but for some reason sometimes the series goes over the max or under the min values. It seems that the start and end values are respected, but not the minimum and maximum values. How can this be fixed? Also, I would like to give the standard deviation for the fluctuations but don't know how. I use randomPerc for the fluctuation, but this is wrong as I would like to specify the std instead.
import numpy as np
import matplotlib.pyplot as plt
def generateRandomData(length, randomPerc, min, max, start, end):
    data_np = (np.random.random(length) - randomPerc).cumsum()
    data_np *= (max - min) / (data_np.max() - data_np.min())
    data_np += np.linspace(start - data_np[0], end - data_np[-1], len(data_np))
    return data_np
randomData=generateRandomData(length = 1000, randomPerc = 0.5, min = 50, max = 100, start = 66, end = 80)
## print values
print("Max Value",randomData.max())
print("Min Value",randomData.min())
print("Start Value",randomData[0])
print("End Value",randomData[-1])
print("Standard deviation",np.std(randomData))
## plot values
plt.figure()
plt.plot(range(randomData.shape[0]), randomData)
plt.show()
plt.close()
Here is a simple loop which checks for series that go under the minimum or over the maximum value. This is exactly what I am trying to avoid. The series should be distributed between the given limits for the min and max values.
## generate 1000 series and check if there are any values over the maximum limit or under the minimum limit
for i in range(1000):
    randomData = generateRandomData(length = 1000, randomPerc = 0.5, min = 50, max = 100, start = 66, end = 80)
    if(randomData.min() < 50):
        print(i, "Value Lower than Min limit")
    if(randomData.max() > 100):
        print(i, "Value Higher than Max limit")
As you impose conditions on your walk, it cannot be considered purely random. Anyway, one way is to generate the walk iteratively and check the boundaries on each iteration (a sketch of that iterative approach is added at the end of this answer). But if you want a vectorized solution, here it is:
def bounded_random_walk(length, lower_bound, upper_bound, start, end, std):
    assert (lower_bound <= start and lower_bound <= end)
    assert (start <= upper_bound and end <= upper_bound)
    bounds = upper_bound - lower_bound
    rand = (std * (np.random.random(length) - 0.5)).cumsum()
    rand_trend = np.linspace(rand[0], rand[-1], length)
    rand_deltas = (rand - rand_trend)
    rand_deltas /= np.max([1, (rand_deltas.max()-rand_deltas.min())/bounds])
    trend_line = np.linspace(start, end, length)
    upper_bound_delta = upper_bound - trend_line
    lower_bound_delta = lower_bound - trend_line
    upper_slips_mask = (rand_deltas-upper_bound_delta) >= 0
    upper_deltas = rand_deltas - upper_bound_delta
    rand_deltas[upper_slips_mask] = (upper_bound_delta - upper_deltas)[upper_slips_mask]
    lower_slips_mask = (lower_bound_delta-rand_deltas) >= 0
    lower_deltas = lower_bound_delta - rand_deltas
    rand_deltas[lower_slips_mask] = (lower_bound_delta + lower_deltas)[lower_slips_mask]
    return trend_line + rand_deltas
randomData = bounded_random_walk(1000, lower_bound=50, upper_bound =100, start=50, end=100, std=10)
You can see it as the solution of a geometric problem. The trend_line connects your start and end points, and has margins defined by lower_bound and upper_bound. rand is your random walk, rand_trend is its trend line and rand_deltas is its deviation from the rand trend line. We collocate the trend lines, and want to make sure that the deltas don't exceed the margins. When rand_deltas exceeds the allowed margin, we "fold" the excess back to the bounds.
At the end you add the resulting random deltas to the start=>end trend line, thus receiving the desired bounded random walk.
The std parameter corresponds to the amount of variance of the random walk.
Update: fixed assertions.
In this version "std" is not guaranteed to be the "interval".
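For completeness, a minimal sketch of the iterative alternative mentioned at the top of this answer (my own illustration, not the vectorized solution above): steer each step toward the end point and clamp the position into the allowed range.

import numpy as np

def iterative_bounded_walk(length, lower_bound, upper_bound, start, end, std):
    walk = np.empty(length)
    walk[0] = start
    for i in range(1, length):
        # pull the walk toward the end point over the remaining steps
        drift = (end - walk[i - 1]) / (length - i)
        step = drift + np.random.normal(scale=std)
        # clamp the new position into the allowed range
        walk[i] = np.clip(walk[i - 1] + step, lower_bound, upper_bound)
    walk[-1] = end
    return walk

randomData = iterative_bounded_walk(1000, lower_bound=50, upper_bound=100, start=66, end=80, std=1.0)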
I noticed you used built-in functions as argument names (min and max), which is not recommended (I changed these to max_1 and min_1). Other than this, your code should work as expected:
def generateRandomData(length, randomPerc, min_1, max_1, start, end):
    data_np = (np.random.random(length) - randomPerc).cumsum()
    data_np *= (max_1 - min_1) / (data_np.max() - data_np.min())
    data_np += np.linspace(start - data_np[0], end - data_np[-1], len(data_np))
    return data_np
randomData=generateRandomData(1000, 0.5, 50, 100, 66, 80)
If you are willing to modify your code this will work:
import random

for_fill = []
# generate 1000 samples within the specified range and save them in for_fill
for x in range(1000):
    generate_rnd_df = random.uniform(50, 100)
    for_fill.append(generate_rnd_df)
# set starting and end point manually
for_fill[0] = 60
for_fill[999] = 80
Here is one way, very crudely expressed in code.
>>> import random
>>> steps = 1000
>>> start = 66
>>> end = 80
>>> step_size = (50,100)
Generate 1,000 steps assured to be within the required range.
>>> crude_walk_steps = [random.uniform(*step_size) for _ in range(steps)]
>>> import numpy as np
Turn these steps into a walk but notice that they fail to meet the requirements.
>>> crude_walk = np.cumsum(crude_walk_steps)
>>> min(crude_walk)
57.099056617839288
>>> max(crude_walk)
75048.948693623403
Calculate a simple linear transformation to scale the steps.
>>> from sympy import *
>>> var('a b')
(a, b)
>>> solve([57.099056617839288*a+b-66,75048.948693623403*a+b-80])
{b: 65.9893403510312, a: 0.000186686954219243}
Scale the steps.
>>> walk = [0.000186686954219243*_+65.9893403510312 for _ in crude_walk]
Verify that the walk now starts and stops where intended.
>>> min(walk)
65.999999999999986
>>> max(walk)
79.999999999999986
You can also generate a stream of random walks and filter out those that do not meet your constraints. Just be aware that by filtering they are not really 'random' anymore.
The code below creates an infinite stream of 'valid' random walks. Be careful with very tight constraints; the 'next' call might take a while ;).
import itertools
import numpy as np
def make_random_walk(first, last, min_val, max_val, size):
    # Generate a sequence of random steps of length `size-2`
    # that will be taken between the start and stop values.
    steps = np.random.normal(size=size-2)
    # The walk is the cumsum of those steps
    walk = steps.cumsum()
    # Performing the walk from the start value gives you your series.
    series = walk + first
    # Compare the target min and max values with the observed ones.
    target_min_max = np.array([min_val, max_val])
    observed_min_max = np.array([series.min(), series.max()])
    # Calculate the absolute 'overshoot' for min and max values
    f = np.array([-1, 1])
    overshoot = (observed_min_max*f - target_min_max*f)
    # Calculate the scale factor to constrain the walk within the
    # target min/max values.
    # Don't upscale.
    correction_base = [walk.min(), walk.max()][np.argmax(overshoot)]
    scale = min(1, (correction_base - overshoot.max()) / correction_base)
    # Generate the scaled series
    new_steps = steps * scale
    new_walk = new_steps.cumsum()
    new_series = new_walk + first
    # Check the size of the final step necessary to reach the target endpoint.
    last_step_size = abs(last - new_series[-1])  # step needed to reach desired end
    # Is it larger than the largest previously observed step?
    if last_step_size > np.abs(new_steps).max():
        # If so, consider this series invalid.
        return None
    else:
        # Else, we found a valid series that meets the constraints.
        return np.concatenate((np.array([first]), new_series, np.array([last])))
start = 66
stop = 80
max_val = 100
min_val = 50
size = 1000
# Create an infinite stream of candidate series
candidate_walks = (
(i, make_random_walk(first=start, last=stop, min_val=min_val, max_val=max_val, size=size))
for i in itertools.count()
)
# Filter out the invalid ones.
valid_walks = ((i, w) for i, w in candidate_walks if w is not None)
idx, walk = next(valid_walks) # Get the next valid series
print(
"Walk #{}: min/max({:.2f}/{:.2f})"
.format(idx, walk.min(), walk.max())
)
I am looking for an efficient way to implement a simple filter with one coefficient that is time-varying and specified by a vector with the same length as the input signal.
The following is a simple implementation of the desired behavior:
def myfilter(signal, weights):
    output = np.empty_like(weights)
    val = signal[0]
    for i in range(len(signal)):
        val += weights[i]*(signal[i] - val)
        output[i] = val
    return output
weights = np.random.uniform(0, 0.1, (100,))
signal = np.linspace(1, 3, 100)
output = myfilter(signal, weights)
Is there a way to do this more efficiently with numpy or scipy?
You can trade in the overhead of the loop for a couple of additional ops:
import numpy as np
def myfilter(signal, weights):
    output = np.empty_like(weights)
    val = signal[0]
    for i in range(len(signal)):
        val += weights[i]*(signal[i] - val)
        output[i] = val
    return output

def vectorised(signal, weights):
    wp = np.r_[1, np.multiply.accumulate(1 - weights[1:])]
    sw = weights * signal
    sw[0] = signal[0]
    sws = np.add.accumulate(sw / wp)
    return wp * sws
weights = np.random.uniform(0, 0.1, (100,))
signal = np.linspace(1, 3, 100)
print(np.allclose(myfilter(signal, weights), vectorised(signal, weights)))
On my machine the vectorised version is several times faster. It uses a "closed form" solution of your recurrence equation.
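For reference, the closed form being used (my own write-up of what the code does): with the recurrence val[i] = (1 - w[i]) * val[i-1] + w[i] * s[i] and val[0] = s[0], define p[0] = 1 and p[i] = (1 - w[1]) * ... * (1 - w[i]). Dividing the recurrence by p[i] gives val[i]/p[i] = val[i-1]/p[i-1] + w[i]*s[i]/p[i], so

val[i] = p[i] * (s[0] + sum over k=1..i of w[k]*s[k]/p[k])

which is exactly wp * np.add.accumulate(sw / wp) in the code above (wp holds p, sw holds w*s with sw[0] = s[0]).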
Edit: For very long signal / weight (100,000 samples, say) this method doesn't work because of overflow. In that regime you can still save a bit (more than 50% on my machine) using the following trick, which has the added bonus that you needn't solve the recurrence formula, only invert it.
from scipy import linalg
def solver(signal, weights):
    rw = 1 / weights[1:]
    v = np.r_[1, rw, 1-rw, 0]
    v.shape = 2, -1
    return linalg.solve_banded((1, 0), v, signal)
This trick uses the fact that your recurrence is formally similar to a Gauss elimination on a matrix with only one nonvanishing subdiagonal. It piggybacks on a library function that specialises in doing precisely that.
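A quick sanity check (my addition), reusing the signal and weights defined above:

print(np.allclose(myfilter(signal, weights), solver(signal, weights)))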
Actually, quite proud of this one.
Is there a numpy builtin to do something like the following? That is, take a list d and return a list filtered_d with any outlying elements removed based on some assumed distribution of the points in d.
import numpy as np
def reject_outliers(data):
    m = 2
    u = np.mean(data)
    s = np.std(data)
    filtered = [e for e in data if (u - m * s < e < u + m * s)]
    return filtered
>>> d = [2,4,5,1,6,5,40]
>>> filtered_d = reject_outliers(d)
>>> print filtered_d
[2,4,5,1,6,5]
I say 'something like' because the function might allow for varying distributions (poisson, gaussian, etc.) and varying outlier thresholds within those distributions (like the m I've used here).
Something important when dealing with outliers is that one should try to use estimators that are as robust as possible. The mean of a distribution will be biased by outliers, but e.g. the median will be much less so.
Building on eumiro's answer:
def reject_outliers(data, m = 2.):
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    s = d/mdev if mdev else np.zeros(len(d))
    return data[s<m]
Here I have replaced the mean with the more robust median, and the standard deviation with the median absolute distance to the median. I then scaled the distances by their (again) median value so that m is on a reasonable relative scale.
Note that for the data[s<m] syntax to work, data must be a numpy array.
This method is almost identical to yours, just written in a more numpy-idiomatic way (it also works on numpy arrays only):
def reject_outliers(data, m=2):
    return data[abs(data - np.mean(data)) < m * np.std(data)]
Benjamin Bannier's answer yields a pass-through when the median of distances from the median is 0, so I found this modified version a bit more helpful for cases as given in the example below.
def reject_outliers_2(data, m=2.):
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    s = d / (mdev if mdev else 1.)
    return data[s < m]
Example:
data_points = np.array([10, 10, 10, 17, 10, 10])
print(reject_outliers(data_points))
print(reject_outliers_2(data_points))
Gives:
[[10, 10, 10, 17, 10, 10]] # 17 is not filtered
[10, 10, 10, 10, 10] # 17 is filtered (its distance, 7, is greater than m)
Building on Benjamin's, using pandas.Series, and replacing MAD with IQR:
def reject_outliers(sr, iq_range=0.5):
    pcnt = (1 - iq_range) / 2
    qlow, median, qhigh = sr.dropna().quantile([pcnt, 0.50, 1-pcnt])
    iqr = qhigh - qlow
    return sr[(sr - median).abs() <= iqr]
For instance, if you set iq_range=0.6, the percentiles of the interquartile range would become 0.20 <--> 0.80, so more of the borderline points would be kept.
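A quick usage sketch (my own example data):

import pandas as pd

sr = pd.Series([2, 4, 5, 1, 6, 5, 40])
# widen the quantile range so only the extreme value is rejected
print(reject_outliers(sr, iq_range=0.8).tolist())  # [2, 4, 5, 1, 6, 5]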
An alternative is to make a robust estimation of the standard deviation (assuming Gaussian statistics). Looking up online calculators, I see that the 90th percentile corresponds to 1.2815σ and the 95th to 1.645σ (http://vassarstats.net/tabs.html?#z).
As a simple example:
import numpy as np
# Create some random numbers
x = np.random.normal(5, 2, 1000)
# Calculate the statistics
print("Mean= ", np.mean(x))
print("Median= ", np.median(x))
print("Max/Min=", x.max(), " ", x.min())
print("StdDev=", np.std(x))
print("90th Percentile", np.percentile(x, 90))
# Add a few large points
x[10] += 1000
x[20] += 2000
x[30] += 1500
# Recalculate the statistics
print()
print("Mean= ", np.mean(x))
print("Median= ", np.median(x))
print("Max/Min=", x.max(), " ", x.min())
print("StdDev=", np.std(x))
print("90th Percentile", np.percentile(x, 90))
# Measure the percentile intervals and then estimate Standard Deviation of the distribution, both from median to the 90th percentile and from the 10th to 90th percentile
p90 = np.percentile(x, 90)
p10 = np.percentile(x, 10)
p50 = np.median(x)
# p50 to p90 is 1.2815 sigma
rSig = (p90-p50)/1.2815
print("Robust Sigma=", rSig)
rSig = (p90-p10)/(2*1.2815)
print("Robust Sigma=", rSig)
The output I get is:
Mean= 4.99760520022
Median= 4.95395274981
Max/Min= 11.1226494654 -2.15388472011
StdDev= 1.976629928
90th Percentile 7.52065379649
Mean= 9.64760520022
Median= 4.95667658782
Max/Min= 2205.43861943 -2.15388472011
StdDev= 88.6263902244
90th Percentile 7.60646688694
Robust Sigma= 2.06772555531
Robust Sigma= 1.99878292462
Which is close to the expected value of 2.
If we want to remove points above/below 5 standard deviations (with 1000 points we would expect 1 value > 3 standard deviations):
y = x[abs(x - p50) < rSig*5]
# Print the statistics again
print("Mean= ", np.mean(y))
print("Median= ", np.median(y))
print("Max/Min=", y.max(), " ", y.min())
print("StdDev=", np.std(y))
Which gives:
Mean= 4.99755359935
Median= 4.95213030447
Max/Min= 11.1226494654 -2.15388472011
StdDev= 1.97692712883
I have no idea which approach is more efficient/robust.
I wanted to do something similar, except setting the number to NaN rather than removing it from the data, since if you remove it you change the length, which can mess up plotting (i.e. if you're only removing outliers from one column in a table, you need it to remain the same length as the other columns so you can plot them against each other).
To do so I used numpy's masking functions:
def reject_outliers(data, m=2):
    stdev = np.std(data)
    mean = np.mean(data)
    maskMin = mean - stdev * m
    maskMax = mean + stdev * m
    mask = np.ma.masked_outside(data, maskMin, maskMax)
    print('Masking values outside of {} and {}'.format(maskMin, maskMax))
    return mask
I would like to provide two methods in this answer: a solution based on the "z score" and a solution based on the "IQR".
The code provided in this answer works on both single-dimensional and multi-dimensional numpy arrays.
Let's import some modules first.
import collections.abc
import numpy as np
import scipy.stats as stat
from scipy.stats import iqr
z score based method
This method tests whether the value falls outside three standard deviations. Based on this rule, the method returns True if the value is an outlier and False if not.
def sd_outlier(x, axis = None, bar = 3, side = 'both'):
    assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'
    d_z = stat.zscore(x, axis = axis)
    if side == 'gt':
        return d_z > bar
    elif side == 'lt':
        return d_z < -bar
    elif side == 'both':
        return np.abs(d_z) > bar
IQR based method
This method will test if the value is less than q1 - 1.5 * iqr or greater than q3 + 1.5 * iqr, which is similar to SPSS's plot method.
def q1(x, axis = None):
    return np.percentile(x, 25, axis = axis)

def q3(x, axis = None):
    return np.percentile(x, 75, axis = axis)

def iqr_outlier(x, axis = None, bar = 1.5, side = 'both'):
    assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'
    d_iqr = iqr(x, axis = axis)
    d_q1 = q1(x, axis = axis)
    d_q3 = q3(x, axis = axis)
    iqr_distance = np.multiply(d_iqr, bar)
    stat_shape = list(x.shape)
    if isinstance(axis, collections.abc.Iterable):
        for single_axis in axis:
            stat_shape[single_axis] = 1
    else:
        stat_shape[axis] = 1
    if side in ['gt', 'both']:
        upper_range = d_q3 + iqr_distance
        upper_outlier = np.greater(x - upper_range.reshape(stat_shape), 0)
    if side in ['lt', 'both']:
        lower_range = d_q1 - iqr_distance
        lower_outlier = np.less(x - lower_range.reshape(stat_shape), 0)
    if side == 'gt':
        return upper_outlier
    if side == 'lt':
        return lower_outlier
    if side == 'both':
        return np.logical_or(upper_outlier, lower_outlier)
Finally, if you want to filter out the outliers, use a numpy selector.
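For example (my own usage sketch), on a 1-D array you can keep the non-outlier values with a boolean selector:

data = np.array([2, 4, 5, 1, 6, 5, 40])
mask = iqr_outlier(data, axis = 0)
filtered = data[~mask]
print(filtered)  # the extreme value 40 is dropped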
Have a nice day.
Consider that all the above methods fail when your standard deviation gets very large due to huge outliers.
(Similar to how the mean calculation fails and you should rather calculate the median; the stdDev is, however, even more affected by such an error than the mean.)
You could try to apply your algorithm iteratively, or you can filter using the interquartile range:
(here "factor" relates to an n*sigma range, but only when your data follows a Gaussian distribution)
import numpy as np
def sortoutOutliers(dataIn, factor):
    quant3, quant1 = np.percentile(dataIn, [75, 25])
    iqr = quant3 - quant1
    iqrSigma = iqr / 1.34896
    medData = np.median(dataIn)
    dataOut = [x for x in dataIn if ((x > medData - factor * iqrSigma) and (x < medData + factor * iqrSigma))]
    return dataOut
So many answers, but I'm adding a new one that can be useful for the author or even for other users.
You could use the Hampel filter. But you need to work with Series.
The Hampel filter returns the outlier indices, so you can delete them from the Series and then convert it back to a list.
To use Hampel filter, you can easily install the package with pip:
pip install hampel
Usage:
# Imports
from hampel import hampel
import pandas as pd
list_d = [2, 4, 5, 1, 6, 5, 40]
# List to Series
time_series = pd.Series(list_d)
# Outlier detection with Hampel filter
# Returns the Outlier indices
outlier_indices = hampel(ts = time_series, window_size = 3)
# Drop Outliers indices from Series
filtered_d = time_series.drop(outlier_indices)
filtered_d.values.tolist()
print(f'filtered_d: {filtered_d.values.tolist()}')
And the output will be:
filtered_d: [2, 4, 5, 1, 6, 5]
Where ts is a pandas Series object and window_size is the window parameter; the total window size will be computed as 2 * window_size + 1.
For this Series I set window_size with the value 3.
The cool thing about working with Series is being able to generate graphics:
# Imports
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
# Plot Original Series
time_series.plot(style = 'k-')
plt.title('Original Series')
plt.show()
# Plot Cleaned Series
filtered_d.plot(style = 'k-')
plt.title('Cleaned Series (Without detected Outliers)')
plt.show()
And the output will be two plots: "Original Series" and "Cleaned Series (Without detected Outliers)".
To learn more about Hampel filter, I recommend the following readings:
Python implementation of the Hampel Filter
Outlier Detection with Hampel Filter
Clean-up your time series data with a Hampel Filter
If you want to get the index positions of the outliers, idx_list will return them.
def reject_outliers(data, m = 2.):
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    s = d / mdev if mdev else np.zeros(len(d))
    data_range = np.arange(len(data))
    idx_list = data_range[s >= m]
    return data[s < m], idx_list
data_points = np.array([8, 10, 35, 17, 73, 77])
print(reject_outliers(data_points))
after rejection: [ 8 10 35 17], index positions of outliers: [4 5]
For a set of images (each image has 3 dimensions), where I wanted to reject outliers for each pixel I used:
mean = np.mean(imgs, axis=0)
std = np.std(imgs, axis=0)
mask = np.greater(0.5 * std + 1, np.abs(imgs - mean))
masked = np.multiply(imgs, mask)
Then it is possible to compute the mean:
masked_mean = np.divide(np.sum(masked, axis=0), np.sum(mask, axis=0))
(I use it for Background Subtraction)
Here I find the outliers in x and substitute them with the median of a window of points (win) around them (taking the median deviation from Benjamin Bannier's answer).
def outlier_smoother(x, m=3, win=3, plots=False):
    ''' finds outliers in x, points > m*mdev(x) [mdev: median deviation]
    and replaces them with the median of win points around them '''
    x_corr = np.copy(x)
    d = np.abs(x - np.median(x))
    mdev = np.median(d)
    idxs_outliers = np.nonzero(d > m*mdev)[0]
    for i in idxs_outliers:
        if i-win < 0:
            x_corr[i] = np.median(np.append(x[0:i], x[i+1:i+win+1]))
        elif i+win+1 > len(x):
            x_corr[i] = np.median(np.append(x[i-win:i], x[i+1:len(x)]))
        else:
            x_corr[i] = np.median(np.append(x[i-win:i], x[i+1:i+win+1]))
    if plots:
        plt.figure('outlier_smoother', clear=True)
        plt.plot(x, label='orig.', lw=5)
        plt.plot(idxs_outliers, x[idxs_outliers], 'ro', label='outliers')
        plt.plot(x_corr, '-o', label='corrected')
        plt.legend()
    return x_corr
Trim outliers in a numpy array along an axis and replace them with the min or max values along this axis, whichever is closer. The threshold is a z-score:
def np_z_trim(x, threshold=10, axis=0):
    """ Replace outliers in numpy ndarray along axis with min or max values
    within the threshold along this axis, whichever is closer."""
    mean = np.mean(x, axis=axis, keepdims=True)
    std = np.std(x, axis=axis, keepdims=True)
    masked = np.where(np.abs(x - mean) < threshold * std, x, np.nan)
    min = np.nanmin(masked, axis=axis, keepdims=True)
    max = np.nanmax(masked, axis=axis, keepdims=True)
    repl = np.where(np.abs(x - max) < np.abs(x - min), max, min)
    return np.where(np.isnan(masked), repl, masked)
My solution drops the top and bottom percentiles, keeping values that are equal to the boundary:
def remove_percentile_outliers(data, percent_to_drop=0.001):
    low, high = data.quantile([percent_to_drop / 2, 1 - percent_to_drop / 2])
    return data[(data >= low) & (data <= high)]
My solution sets the outliers equal to the previous value.
test_data = [2,4,5,1,6,5,40, 3]
def reject_outliers(data, m=2):
    mean = np.mean(data)
    std = np.std(data)
    for i in range(len(data)):
        if np.abs(data[i] - mean) > m*std:
            data[i] = data[i-1]
    return data
reject_outliers(test_data)
Output:
[2, 4, 5, 1, 6, 5, 5, 3]