Python: ValueError: Input must be 1- or 2-d

I have the following code to estimate a Tobit regression model in Python. The code has three parts: the data definition, the estimator builder, and the estimation itself.
import numpy as np
from scipy.optimize import minimize
# define the dependent variable and independent variables
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
# Add a column of ones to the independent variables for the constant term
X = np.c_[np.ones(X.shape[0]), X]
# Define the likelihood function for the Tobit model
def likelihood(params, y, X, lower, upper):
    beta = params[:-1]
    sigma = params[-1]
    mu = X @ beta
    prob = (1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-0.5 * ((y - mu) / sigma)**2))
    prob[y < lower] = 0
    prob[y > upper] = 0
    return -np.log(prob).sum()
# Set the initial values for the parameters and the lower and upper bounds for censoring
params_init = np.random.normal(size=X.shape[1] + 1)
bounds = [(None, None) for i in range(X.shape[1])] + [(1e-10, None)]
# Perform the MLE estimation
res = minimize(likelihood, params_init, args=(y, X, 0, 100), bounds=bounds, method='L-BFGS-B')
# Extract the estimated parameters and their standard errors
params = res.x
stderr = np.sqrt(np.diag(res.hess_inv))
# Print the results
print(f'Coefficients: {params[:-1]}')
print(f'Standard Errors: {stderr[:-1]}')
print(f'Sigma: {params[-1]:.4f}')
Why am I getting this error message?
Thank you.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-245-5f39f416cc07> in <module>
31 # Extract the estimated parameters and their standard errors
32 params = res.x
---> 33 stderr = np.sqrt(np.diag(res.hess_inv))
34
35 # Print the results
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/overrides.py in diag(*args, **kwargs)
/opt/anaconda3/lib/python3.8/site-packages/numpy/lib/twodim_base.py in diag(v, k)
307 return diagonal(v, k)
308 else:
--> 309 raise ValueError("Input must be 1- or 2-d.")
310
311
ValueError: Input must be 1- or 2-d.
EDIT: If you want to look at the kind of data I'm dealing with, you can simulate it with these lines of code:
import numpy as np
import pandas as pd

data = pd.DataFrame()
# Append 'interview probabilities' for individuals with and without disabilities
interview_prob_disabled = np.random.normal(38.63, 28.72, 619)
interview_prob_enabled = np.random.normal(44.27, 28.19, 542)
interview_prob = np.append(interview_prob_disabled, interview_prob_enabled)
# Clip the variable so it is neither negative nor above 100, and round it to integers
interview_prob = np.clip(interview_prob, 0, 100)
interview_prob = np.round(interview_prob)
# Add the 'interview probabilities' variable to the dataframe
data['Interview Probabilities'] = interview_prob
# Add other variables such as age, gender, employment status, education, etc.
data['Age'] = np.random.randint(18, 65, size=len(interview_prob))
data['Gender'] = np.random.choice(['Male', 'Female'], size=len(interview_prob))
data['Employment Status'] = np.random.choice(['Employed', 'Unemployed', 'Retired'], size=len(interview_prob))
data['Education Level'] = np.random.choice(['High School', 'College', 'Vocational', 'Graduate School'], size=len(interview_prob))
# Add a 'disability status' variable as a dummy
data['Disability Status'] = np.append(np.repeat('Disabled', 619), np.repeat('Non-disabled', 542))
# Encode the categorical variables as integers ('Retired' included, so no NaNs are produced)
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})
data['Employment Status'] = data['Employment Status'].map({'Employed': 0, 'Unemployed': 1, 'Retired': 2})
data['Education Level'] = data['Education Level'].map({'High School': 0, 'College': 1, 'Vocational': 2, 'Graduate School': 3})
data['Disability Status'] = data['Disability Status'].map({'Disabled': 1, 'Non-disabled': 0})
# Print the df
data

The problem is that your solver, L-BFGS-B, returns a LbfgsInvHessProduct object (a linear operator) from .hess_inv instead of a plain NumPy array (which a solver such as BFGS would give), and np.diag cannot handle that object.
One solution to your problem would be to use res.hess_inv.todense() instead.
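A minimal sketch of both workarounds, assuming res comes from the minimize() call above (note that BFGS does not support the bounds argument, so it would have to be dropped):

# Option 1: densify the LbfgsInvHessProduct before taking the diagonal
stderr = np.sqrt(np.diag(res.hess_inv.todense()))

# Option 2: switch to a solver whose hess_inv is a plain ndarray
# (BFGS ignores bounds, so the positivity constraint on sigma is lost)
res = minimize(likelihood, params_init, args=(y, X, 0, 100), method='BFGS')
stderr = np.sqrt(np.diag(res.hess_inv))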

Related

AssertionError: Total area is zero in defuzzification

My code below raises "AssertionError: Total area is zero in defuzzification!". I'm honestly trying to understand what is wrong, but I'm at a dead end; if anyone has a solution it would be appreciated. The gist of the code is to use fuzzy logic in combination with VADER to classify whether a text is negative or positive.
import numpy as np
import skfuzzy as fuzz
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# `tweets`, `senti` and `doclen` are defined earlier (not shown)
x_p = np.arange(0, 1, 0.1)
x_n = np.arange(0, 1, 0.1)
x_op = np.arange(0, 10, 1)
p_lo = fuzz.trimf(x_p, [0, 0, 0.5])
p_md = fuzz.trimf(x_p, [0, 0.5, 1])
p_hi = fuzz.trimf(x_p, [0.5, 1, 1])
n_lo = fuzz.trimf(x_n, [0, 0, 0.5])
n_md = fuzz.trimf(x_n, [0, 0.5, 1])
n_hi = fuzz.trimf(x_n, [0.5, 1, 1])
op_Neg = fuzz.trimf(x_op, [0, 0, 5])  # Scale: Neg Neu Pos
op_Neu = fuzz.trimf(x_op, [0, 5, 10])
op_Pos = fuzz.trimf(x_op, [5, 10, 10])
sid = SentimentIntensityAnalyzer()
sentiment = []
sentiment_val = []
sentiment_doc = []
for j in range(doclen):
    sentiment_doc.append(senti[j])
    ss = sid.polarity_scores(tweets[j])
    posscore = ss['pos']
    negscore = ss['neg']
    neuscore = ss['neu']
    compoundscore = ss['compound']
    print(str(j+1) + " {:-<65} {}".format(tweets[j], str(ss)))
    print("\nPositive Score for each tweet :")
    if posscore == 1:
        posscore = 0.9
    else:
        posscore = round(posscore, 1)
    print(posscore)
    print("\nNegative Score for each tweet :")
    if negscore == 1:
        negscore = 0.9
    else:
        negscore = round(negscore, 1)
    print(negscore)
    # We need the activation of our fuzzy membership functions at these values.
    p_level_lo = fuzz.interp_membership(x_p, p_lo, posscore)
    p_level_md = fuzz.interp_membership(x_p, p_md, posscore)
    p_level_hi = fuzz.interp_membership(x_p, p_hi, posscore)
    n_level_lo = fuzz.interp_membership(x_n, n_lo, negscore)
    n_level_md = fuzz.interp_membership(x_n, n_md, negscore)
    n_level_hi = fuzz.interp_membership(x_n, n_hi, negscore)
    # Now we take our rules and apply them. Each rule combines a positive
    # level AND a negative level, so we take the minimum with np.fmin.
    active_rule1 = np.fmin(p_level_lo, n_level_lo)
    active_rule2 = np.fmin(p_level_md, n_level_lo)
    active_rule3 = np.fmin(p_level_hi, n_level_lo)
    active_rule4 = np.fmin(p_level_lo, n_level_md)
    active_rule5 = np.fmin(p_level_md, n_level_md)
    active_rule6 = np.fmin(p_level_hi, n_level_md)
    active_rule7 = np.fmin(p_level_lo, n_level_hi)
    active_rule8 = np.fmin(p_level_md, n_level_hi)
    active_rule9 = np.fmin(p_level_hi, n_level_hi)
    # Now we apply this by clipping the top off the corresponding output
    # membership function with `np.fmin`
    n1 = np.fmax(active_rule4, active_rule7)
    n2 = np.fmax(n1, active_rule8)
    op_activation_lo = np.fmin(n2, op_Neg)
    neu1 = np.fmax(active_rule1, active_rule5)
    neu2 = np.fmax(neu1, active_rule9)
    op_activation_md = np.fmin(neu2, op_Neu)
    p1 = np.fmax(active_rule2, active_rule3)
    p2 = np.fmax(p1, active_rule6)
    op_activation_hi = np.fmin(p2, op_Pos)
    op0 = np.zeros_like(x_op)
    # Aggregate all three output membership functions together
    aggregated = np.fmax(op_activation_lo,
                         np.fmax(op_activation_md, op_activation_hi))
    # Calculate defuzzified result
    op = fuzz.defuzz(x_op, aggregated, 'centroid')
    output = round(op, 2)
    op_activation = fuzz.interp_membership(x_op, aggregated, op)  # for plot
    if 0 < output < 3.33:
        print("\nOutput after Defuzzification: Negative")
        sentiment.append("Negative")
        sentiment_val.append('0')
    elif 3.34 < output < 10:
        print("\nOutput after Defuzzification: Positive")
        sentiment.append("Positive")
        sentiment_val.append('1')
    print("Doc sentiment: " + str(senti[j]) + "\n")
The traceback is the following:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
/var/folders/1c/pf8ljm0n5d7_w36ty_m7hyhw0000gn/T/ipykernel_1538/2987240111.py in <module>
151
152 # Calculate defuzzified result
--> 153 op = fuzz.defuzz(x_op, aggregated, 'centroid')
154 output=round(op,2)
155
~/opt/anaconda3/lib/python3.9/site-packages/skfuzzy/defuzzify/defuzz.py in defuzz(x, mfx, mode)
246 if 'centroid' in mode or 'bisector' in mode:
247 zero_truth_degree = mfx.sum() == 0 # Approximation of total area
--> 248 assert not zero_truth_degree, 'Total area is zero in defuzzification!'
249
250 if 'centroid' in mode:
AssertionError: Total area is zero in defuzzification!
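For what it's worth, the assertion fires when aggregated sums to zero, i.e. none of the output membership functions was activated for that tweet. A hypothetical guard (my suggestion, not from the original code), placed just before the defuzz call inside the loop:

    # Hypothetical guard: fuzz.defuzz asserts when the aggregated membership
    # function is all zeros (no rule fired), so skip those tweets instead.
    if aggregated.sum() == 0:
        print("No rule activated for tweet", j, "- skipping defuzzification")
        continue
    op = fuzz.defuzz(x_op, aggregated, 'centroid')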

ValueError: Stop argument for islice() must be None or an integer: 0 <= x <= sys.maxsize on topic coherence

I'm following this tutorial https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0 and ran into a problem. The purpose of this code is to iterate over ranges of topic counts, alpha values, and beta values, so I can determine the optimal number of topics from the coherence scores.
import gensim
from gensim.models import CoherenceModel

def compute_coherence_values(corpus, dictionary, k, a, b):
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b,
                                           per_word_topics=True)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized,
                                         dictionary=id2word, coherence='c_v')
    return coherence_model_lda.get_coherence()
and then
import numpy as np
import pandas as pd
import tqdm

grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25),
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
               gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75),
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                 }
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word,
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()
This produces the error: ValueError: Stop argument for islice() must be None or an integer: 0 <= x <= sys.maxsize.
It took me forever to figure this out. Where you have num_of_docs*0.75 in corpus_sets, change it to int(num_of_docs*0.75) and it will run. It will take a long time, but it gets rid of the error.
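For reference, a minimal sketch of the fixed portion of the snippet (only the cast is new):

corpus_sets = [# ClippedCorpus slices the corpus with itertools.islice, whose
               # stop argument must be an integer, hence the int() cast
               gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.75)),
               corpus]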

DeprecationWarning: object of type <class 'float'> cannot be safely interpreted as an integer

My code snippet is as follows:
import numpy as np
from scipy.spatial.distance import cdist

data, l = make_moons(100000)
s = np.random.permutation(100000)
temp1 = data[s[0:200], :]   # random sample of 200 rows
temp2 = cdist(data, temp1)  # pairwise distances between the two sets of observations
C = np.exp(-temp2/sig)      # `sig` is a kernel bandwidth defined elsewhere
W = C[s[0:200], :]
where make_moons is defined as:
from math import pi

def make_moons(n):
    """Create a 'two moons' dataset with n feature vectors,
    and 2 features per vector."""
    assert n % 2 == 0, 'n must be even'
    # create upper moon
    theta = np.linspace(-pi / 2, pi / 2, n/2)
    # create lower moon
    x = np.r_[np.sin(theta) - pi / 4, np.sin(theta)]
    y = np.r_[np.cos(theta), -np.cos(theta) + .5]
    data = np.c_[x, y]
    # Add some noise
    data = data + 0.03 * np.random.standard_normal(data.shape)
    # create labels
    labels = np.r_[np.ones((n//2, 1)), -np.ones((n//2, 1))]
    labels = labels.ravel().astype(np.int32)
    return data, labels
I want to get some insight into the captioned warning. As I see it, the arrays are floating point, so why are they interpreted (or why do they need to be interpreted) as integers?
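For what it's worth, the likely culprit here is the num argument of np.linspace: under Python 3 division, n/2 is a float, and NumPy warns before truncating it to an integer. A minimal sketch of the fix (my reading of the warning, not a confirmed answer):

# n/2 is a float under Python 3 division; np.linspace expects an
# integer `num`, which is what triggers the DeprecationWarning
theta = np.linspace(-pi / 2, pi / 2, n // 2)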

Multiple levels in hierarchical linear regression using PYMC3

I am trying to set up a hierarchical linear regression model using PYMC3. In my particular case, I want to see whether postal codes provide a meaningful structure for other features. Suppose I use the following mock data:
import pandas as pd
import numpy as np
import pymc3 as pm

data = pd.DataFrame({"postalcode": np.floor(np.random.uniform(low=10, high=99, size=1000)),
                     "x": np.random.normal(size=1000),
                     "y": np.random.normal(size=1000)})
data["postalcode"] = data["postalcode"].astype(int)
I generate postal codes from 10 to 99, as well as a normally distributed feature x and a target value y. Now I set up my indices for postal code level 1 and level 2:
def create_pc_index(level):
    pc = data["postalcode"].astype(str).str[0:level]
    unique_pc = pc.unique()
    pc_dict = dict(zip(unique_pc, range(0, len(unique_pc))))
    return pc_dict, pc.apply(lambda x: pc_dict[x]).values

pc1_dict, pc1_index = create_pc_index(1)
pc2_dict, pc2_index = create_pc_index(2)
Using the first digit of the postal code as the hierarchical attribute works fine:
number_of_samples = 1000
x = data["x"]
y = data["y"]
with pm.Model() as model:
    sigma = pm.HalfCauchy('sigma', beta=10, testval=0.5, shape=1)
    mu_i = pm.Normal("mu_i", 5, sd=25, shape=1)
    intercept = pm.Normal('Intercept', mu_i, sd=1, shape=len(pc1_dict))
    mu_s = pm.Normal("mu_x", 0, sd=3, shape=1)
    x_coeffs = pm.Normal("x", mu_s, 1, shape=len(pc1_dict))
    mean = intercept[pc1_index] + x_coeffs[pc1_index] * x
    likelihood_mean = pm.Deterministic("mean", mean)
    likelihood = pm.Normal('y', mu=likelihood_mean, sd=sigma, observed=y)
    trace = pm.sample(number_of_samples)
burned_trace = trace[number_of_samples // 2:]
However, if I want to add a second level to my hierarchy (in this case only on the intercept, ignoring x for the moment), I run into shape problems:
with pm.Model() as model:
    sigma = pm.HalfCauchy('sigma', beta=10, testval=0.5, shape=1)
    mu_i_level_1 = pm.Normal("mu_i", 0, sd=25, shape=1)
    mu_i_level_2 = pm.Normal("mu_i_level_2", mu_i_level_1, sd=1, shape=len(pc1_dict))
    intercept = pm.Normal('Intercept', mu_i_level_2[pc1_index], sd=1, shape=len(pc2_dict))
    mu_s = pm.Normal("mu_x", 0, sd=3, shape=1)
    x_coeffs = pm.Normal("x", mu_s, 1, shape=len(pc1_dict))
    mean = intercept[pc2_index] + x_coeffs[pc1_index] * x
    likelihood_mean = pm.Deterministic("mean", mean)
    likelihood = pm.Normal('y', mu=likelihood_mean, sd=sigma, observed=y)
    trace = pm.sample(number_of_samples)
burned_trace = trace[number_of_samples // 2:]
The error message is:
operands could not be broadcast together with shapes (89,) (1000,)
How do I model multiple levels in my regression correctly? Is this just an issue with the correct shape size or is there a more fundamental error on my part?
Thanks in advance!
I don't think intercept can have a shape of len(pc2_dict) while its mu is indexed with the per-observation pc1_index: mu_i_level_2[pc1_index] has one entry per row (1000), not per two-digit group (89), which is exactly the (89,) vs (1000,) broadcast failure. The contradiction is here:
intercept = pm.Normal('Intercept', mu_i_level_2[pc1_index], sd=1, shape=len(pc2_dict))
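A sketch of one way to make the shapes consistent (my reading, untested): give each two-digit code its own intercept and center it on its one-digit parent, using a parent map of length len(pc2_dict) instead of pc1_index. The pc2_to_pc1 array below is hypothetical and is built from the two dicts returned by create_pc_index:

# Hypothetical parent map: for every unique 2-digit code, the index of
# its 1-digit parent group (dict insertion order matches index order)
pc2_to_pc1 = np.array([pc1_dict[code[0]] for code in pc2_dict])

with pm.Model() as model:
    sigma = pm.HalfCauchy('sigma', beta=10, testval=0.5, shape=1)
    mu_i_level_1 = pm.Normal("mu_i", 0, sd=25, shape=1)
    mu_i_level_2 = pm.Normal("mu_i_level_2", mu_i_level_1, sd=1, shape=len(pc1_dict))
    # mu now has shape (len(pc2_dict),), matching the declared shape
    intercept = pm.Normal('Intercept', mu_i_level_2[pc2_to_pc1], sd=1,
                          shape=len(pc2_dict))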

Is there a numpy builtin to reject outliers from a list

Is there a numpy builtin to do something like the following? That is, take a list d and return a list filtered_d with any outlying elements removed based on some assumed distribution of the points in d.
import numpy as np

def reject_outliers(data):
    m = 2
    u = np.mean(data)
    s = np.std(data)
    filtered = [e for e in data if (u - m * s < e < u + m * s)]
    return filtered
>>> d = [2, 4, 5, 1, 6, 5, 40]
>>> filtered_d = reject_outliers(d)
>>> print(filtered_d)
[2, 4, 5, 1, 6, 5]
I say 'something like' because the function might allow for varying distributions (Poisson, Gaussian, etc.) and varying outlier thresholds within those distributions (like the m I've used here).
Something important when dealing with outliers is to use estimators that are as robust as possible. The mean of a distribution is biased by outliers, but the median, for example, is much less so.
Building on eumiro's answer:
def reject_outliers(data, m=2.):
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    s = d / mdev if mdev else np.zeros(len(d))
    return data[s < m]
Here I have replaced the mean with the more robust median, and the standard deviation with the median absolute deviation from the median. I then scaled the distances by their (again) median value so that m is on a reasonable relative scale.
Note that for the data[s<m] syntax to work, data must be a numpy array.
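For example, the list from the question has to be converted first (a small usage sketch):

d = [2, 4, 5, 1, 6, 5, 40]
filtered_d = reject_outliers(np.array(d))  # a plain list would fail at data[s < m]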
This method is almost identical to yours, just more NumPy-flavored (it also works only on numpy arrays):
def reject_outliers(data, m=2):
    return data[abs(data - np.mean(data)) < m * np.std(data)]
Benjamin Bannier's answer yields a pass-through when the median of distances from the median is 0, so I found this modified version a bit more helpful for cases like the one given in the example below.
def reject_outliers_2(data, m=2.):
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    s = d / (mdev if mdev else 1.)
    return data[s < m]
Example:
data_points = np.array([10, 10, 10, 17, 10, 10])
print(reject_outliers(data_points))
print(reject_outliers_2(data_points))
Gives:
[10 10 10 17 10 10]  # 17 is not filtered
[10 10 10 10 10]     # 17 is filtered (its distance from the median, 7, gives s = 7 > m)
Building on Benjamin's, using pandas.Series, and replacing MAD with IQR:
def reject_outliers(sr, iq_range=0.5):
    pcnt = (1 - iq_range) / 2
    qlow, median, qhigh = sr.dropna().quantile([pcnt, 0.50, 1 - pcnt])
    iqr = qhigh - qlow
    return sr[(sr - median).abs() <= iqr]
For instance, if you set iq_range=0.6, the percentiles of the interquartile range become 0.20 <--> 0.80, so more outliers will be included.
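A short usage sketch with the data from the question (the Series wrapper is required, since this version works on pandas objects):

import pandas as pd

sr = pd.Series([2, 4, 5, 1, 6, 5, 40])
print(reject_outliers(sr).tolist())                # default band, iq_range=0.5
print(reject_outliers(sr, iq_range=0.6).tolist())  # wider band keeps more points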
An alternative is to make a robust estimate of the standard deviation (assuming Gaussian statistics). Looking up online calculators, I see that the 90th percentile corresponds to 1.2815σ and the 95th to 1.645σ (http://vassarstats.net/tabs.html?#z).
As a simple example:
import numpy as np
# Create some random numbers
x = np.random.normal(5, 2, 1000)
# Calculate the statistics
print("Mean= ", np.mean(x))
print("Median= ", np.median(x))
print("Max/Min=", x.max(), " ", x.min())
print("StdDev=", np.std(x))
print("90th Percentile", np.percentile(x, 90))
# Add a few large points
x[10] += 1000
x[20] += 2000
x[30] += 1500
# Recalculate the statistics
print()
print("Mean= ", np.mean(x))
print("Median= ", np.median(x))
print("Max/Min=", x.max(), " ", x.min())
print("StdDev=", np.std(x))
print("90th Percentile", np.percentile(x, 90))
# Measure the percentile intervals and then estimate Standard Deviation of the distribution, both from median to the 90th percentile and from the 10th to 90th percentile
p90 = np.percentile(x, 90)
p10 = np.percentile(x, 10)
p50 = np.median(x)
# p50 to p90 is 1.2815 sigma
rSig = (p90-p50)/1.2815
print("Robust Sigma=", rSig)
rSig = (p90-p10)/(2*1.2815)
print("Robust Sigma=", rSig)
The output I get is:
Mean= 4.99760520022
Median= 4.95395274981
Max/Min= 11.1226494654 -2.15388472011
StdDev= 1.976629928
90th Percentile 7.52065379649

Mean= 9.64760520022
Median= 4.95667658782
Max/Min= 2205.43861943 -2.15388472011
StdDev= 88.6263902244
90th Percentile 7.60646688694
Robust Sigma= 2.06772555531
Robust Sigma= 1.99878292462
Which is close to the expected value of 2.
If we want to remove points above/below 5 standard deviations (with 1000 points we would expect 1 value > 3 standard deviations):
y = x[abs(x - p50) < rSig*5]
# Print the statistics again
print("Mean= ", np.mean(y))
print("Median= ", np.median(y))
print("Max/Min=", y.max(), " ", y.min())
print("StdDev=", np.std(y))
Which gives:
Mean= 4.99755359935
Median= 4.95213030447
Max/Min= 11.1226494654 -2.15388472011
StdDev= 1.97692712883
I have no idea which approach is the more efficient/robust.
I wanted to do something similar, except setting the number to NaN rather than removing it from the data, since if you remove it you change the length, which can mess up plotting (i.e. if you're only removing outliers from one column in a table, you need it to stay the same length as the other columns so you can plot them against each other).
To do so I used numpy's masking functions:
def reject_outliers(data, m=2):
    stdev = np.std(data)
    mean = np.mean(data)
    maskMin = mean - stdev * m
    maskMax = mean + stdev * m
    mask = np.ma.masked_outside(data, maskMin, maskMax)
    print('Masking values outside of {} and {}'.format(maskMin, maskMax))
    return mask
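If an actual NaN-filled array is needed instead of a masked array, the standard filled() method of masked arrays does the conversion (a small follow-up sketch; the float dtype is needed so NaN can be stored):

data = np.array([2., 4., 5., 1., 6., 5., 40.])
masked = reject_outliers(data)
with_nans = masked.filled(np.nan)  # outliers become NaN, length is preserved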
I would like to provide two methods in this answer: a solution based on the "z score" and a solution based on the "IQR".
The code provided in this answer works on both single-dimensional and multidimensional numpy arrays.
First, let's import some modules.
import collections.abc
import numpy as np
import scipy.stats as stat
from scipy.stats import iqr
z score based method
This method tests whether the value falls outside three standard deviations. Based on this rule, the method returns true if the value is an outlier and false otherwise.
def sd_outlier(x, axis=None, bar=3, side='both'):
    assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'
    d_z = stat.zscore(x, axis=axis)
    if side == 'gt':
        return d_z > bar
    elif side == 'lt':
        return d_z < -bar
    elif side == 'both':
        return np.abs(d_z) > bar
IQR based method
This method tests whether the value is less than q1 - 1.5 * iqr or greater than q3 + 1.5 * iqr, which is similar to SPSS's boxplot method.
def q1(x, axis=None):
    return np.percentile(x, 25, axis=axis)

def q3(x, axis=None):
    return np.percentile(x, 75, axis=axis)

def iqr_outlier(x, axis=None, bar=1.5, side='both'):
    assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'
    d_iqr = iqr(x, axis=axis)
    d_q1 = q1(x, axis=axis)
    d_q3 = q3(x, axis=axis)
    iqr_distance = np.multiply(d_iqr, bar)
    stat_shape = list(x.shape)
    if axis is None:
        # with axis=None the quantiles are scalars and broadcast over x
        stat_shape = [1] * x.ndim
    elif isinstance(axis, collections.abc.Iterable):
        for single_axis in axis:
            stat_shape[single_axis] = 1
    else:
        stat_shape[axis] = 1
    if side in ['gt', 'both']:
        upper_range = d_q3 + iqr_distance
        upper_outlier = np.greater(x - upper_range.reshape(stat_shape), 0)
    if side in ['lt', 'both']:
        lower_range = d_q1 - iqr_distance
        lower_outlier = np.less(x - lower_range.reshape(stat_shape), 0)
    if side == 'gt':
        return upper_outlier
    if side == 'lt':
        return lower_outlier
    if side == 'both':
        return np.logical_or(upper_outlier, lower_outlier)
Finally, if you want to filter out the outliers, use a numpy selector.
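For instance, a boolean selector built from either method could look like this (a short usage sketch):

x = np.array([2., 4., 5., 1., 6., 5., 40.])
print(x[~sd_outlier(x)])   # drop points more than 3 z-scores from the mean
print(x[~iqr_outlier(x)])  # drop points outside the 1.5 * IQR fences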
Have a nice day.
Consider that all the above methods fail when your standard deviation gets very large due to huge outliers (similar to how the mean calculation fails, which is why one should calculate the median instead; the mean is even more prone to such an error than the standard deviation).
You could try to apply your algorithm iteratively, or you can filter using the interquartile range (here "factor" relates to an n*sigma range, but only when your data follow a Gaussian distribution):
import numpy as np

def sortoutOutliers(dataIn, factor):
    quant3, quant1 = np.percentile(dataIn, [75, 25])
    iqr = quant3 - quant1
    iqrSigma = iqr / 1.34896
    medData = np.median(dataIn)
    dataOut = [x for x in dataIn
               if (medData - factor * iqrSigma < x < medData + factor * iqrSigma)]
    return dataOut
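And a sketch of the iterative idea mentioned above (hypothetical, reusing the simple mean/std rule until no more points are removed):

def reject_outliers_iterative(data, m=2.0):
    """Repeatedly drop points beyond m standard deviations until stable."""
    data = np.asarray(data)
    while True:
        kept = data[np.abs(data - np.mean(data)) < m * np.std(data)]
        if len(kept) == len(data) or len(kept) == 0:
            return kept
        data = kept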
So many answers, but I'm adding a new one that can be useful for the author or even for other users.
You could use the Hampel filter, but you need to work with a Series.
The Hampel filter returns the outlier indices; you can then delete them from the Series and convert it back to a list.
To use Hampel filter, you can easily install the package with pip:
pip install hampel
Usage:
# Imports
from hampel import hampel
import pandas as pd
list_d = [2, 4, 5, 1, 6, 5, 40]
# List to Series
time_series = pd.Series(list_d)
# Outlier detection with Hampel filter
# Returns the Outlier indices
outlier_indices = hampel(ts = time_series, window_size = 3)
# Drop Outliers indices from Series
filtered_d = time_series.drop(outlier_indices)
filtered_d.values.tolist()
print(f'filtered_d: {filtered_d.values.tolist()}')
And the output will be:
filtered_d: [2, 4, 5, 1, 6, 5]
Here, ts is a pandas Series object, and window_size sets the half-window: the total window size is computed as 2 * window_size + 1. For this Series I set window_size to 3.
The cool thing about working with Series is being able to generate graphics:
# Imports
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
# Plot Original Series
time_series.plot(style = 'k-')
plt.title('Original Series')
plt.show()
# Plot Cleaned Series
filtered_d.plot(style = 'k-')
plt.title('Cleaned Series (Without detected Outliers)')
plt.show()
And the output will be two plots: the original series, followed by the cleaned series without the detected outliers.
To learn more about Hampel filter, I recommend the following readings:
Python implementation of the Hampel Filter
Outlier Detection with Hampel Filter
Clean-up your time series data with a Hampel Filter
If you want to get the index positions of the outliers, idx_list will return them.
def reject_outliers(data, m=2.):
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    s = d / mdev if mdev else np.zeros(len(d))
    data_range = np.arange(len(data))
    idx_list = data_range[s >= m]
    return data[s < m], idx_list
data_points = np.array([8, 10, 35, 17, 73, 77])
print(reject_outliers(data_points))
after rejection: [ 8 10 35 17], index positions of outliers: [4 5]
For a set of images (each image has 3 dimensions), where I wanted to reject outliers for each pixel, I used:
mean = np.mean(imgs, axis=0)
std = np.std(imgs, axis=0)
mask = np.greater(0.5 * std + 1, np.abs(imgs - mean))
masked = np.multiply(imgs, mask)
Then it is possible to compute the mean:
masked_mean = np.divide(np.sum(masked, axis=0), np.sum(mask, axis=0))
(I use it for Background Subtraction)
Here I find the outliers in x and substitute them with the median of a window of points (win) around them (taking the median deviation from Benjamin Bannier's answer).
import numpy as np
import matplotlib.pyplot as plt

def outlier_smoother(x, m=3, win=3, plots=False):
    '''finds outliers in x, points > m*mdev(x) [mdev: median deviation]
    and replaces them with the median of win points around them'''
    x_corr = np.copy(x)
    d = np.abs(x - np.median(x))
    mdev = np.median(d)
    idxs_outliers = np.nonzero(d > m * mdev)[0]
    for i in idxs_outliers:
        if i - win < 0:
            x_corr[i] = np.median(np.append(x[0:i], x[i+1:i+win+1]))
        elif i + win + 1 > len(x):
            x_corr[i] = np.median(np.append(x[i-win:i], x[i+1:len(x)]))
        else:
            x_corr[i] = np.median(np.append(x[i-win:i], x[i+1:i+win+1]))
    if plots:
        plt.figure('outlier_smoother', clear=True)
        plt.plot(x, label='orig.', lw=5)
        plt.plot(idxs_outliers, x[idxs_outliers], 'ro', label='outliers')
        plt.plot(x_corr, '-o', label='corrected')
        plt.legend()
    return x_corr
Trim outliers in a numpy array along an axis and replace them with the min or max values along this axis, whichever is closer. The threshold is a z-score:
def np_z_trim(x, threshold=10, axis=0):
    """Replace outliers in a numpy ndarray along axis with the min or max
    values within the threshold along this axis, whichever is closer."""
    mean = np.mean(x, axis=axis, keepdims=True)
    std = np.std(x, axis=axis, keepdims=True)
    masked = np.where(np.abs(x - mean) < threshold * std, x, np.nan)
    lo = np.nanmin(masked, axis=axis, keepdims=True)  # renamed from `min` to avoid shadowing the builtin
    hi = np.nanmax(masked, axis=axis, keepdims=True)  # renamed from `max`
    repl = np.where(np.abs(x - hi) < np.abs(x - lo), hi, lo)
    return np.where(np.isnan(masked), repl, masked)
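A small usage sketch (the values are purely illustrative):

arr = np.array([[1., 2., 3., 4., 100.]]).T  # one column with an obvious outlier
print(np_z_trim(arr, threshold=1))  # the 100 is replaced by the closer of the surviving min/max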
My solution drops the top and bottom percentiles, keeping values that are equal to the boundary:
def remove_percentile_outliers(data, percent_to_drop=0.001):
    low, high = data.quantile([percent_to_drop / 2, 1 - percent_to_drop / 2])
    return data[(data >= low) & (data <= high)]
My solution sets each outlier equal to the previous value.
test_data = [2, 4, 5, 1, 6, 5, 40, 3]

def reject_outliers(data, m=2):
    mean = np.mean(data)
    std = np.std(data)
    for i in range(len(data)):
        if np.abs(data[i] - mean) > m * std:
            data[i] = data[i - 1]
    return data

reject_outliers(test_data)
Output:
[2, 4, 5, 1, 6, 5, 5, 3]
