How to flag an outlier(s) /anomaly in selected columns in python?

How to flag an outlier(s) /anomaly in selected columns in python? - python

In the dataset df below. I want to flag the anomalies in all columns except A, B,C and L.
Any value less than 1500 or greater than 400000 is regarded as an anomaly.
import pandas as pd
# intialise data of lists
data = {
'A':['T1', 'T2', 'T3', 'T4', 'T5'],
'B':[1,1,1,1,1],
'C':[1,2,3,5,9],
'D':[12005, 18190, 1034, 15310, 31117],
'E':[11021, 19112, 19021, 12, 24509 ],
'F':[10022,19910, 19113,19999, 25519],
'G':[14029, 29100, 39022, 24509, 412262],
'H':[52119,32991,52883,69359,57835],
'J':[41218, 52991,55121,69152,79355],
'K': [43211,8199991,56881,212,77342],
'L': [1,0,1,0,0],
'M': [31211,42901,53818,62158,69325],
}
# Create DataFrame
df = pd.DataFrame(data)
# Print the output.
df
Attempt:
exclude_cols = ['A','B','C','L']
def flag_outliers(s, exclude_cols):
if s.name in exclude_cols:
return '' # or None, or whatever df.style() needs
else:
s = pd.to_numeric(s, errors='coerce')
indexes = (s<1500)|(s>400000)
return ['background-color: red' if v else '' for v in indexes]
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1)
Result of the code:
Desired output should look like this:
Thanks for the effort!

If you set the subset as the argument of the apply function, you will get what you want.
exclude_cols = ['A','B','C','L']
def flag_outliers(s, exclude_cols):
if s.name in exclude_cols:
print(s.name)
return '' # or None, or whatever df.style() needs
else:
s = pd.to_numeric(s, errors='coerce')
indexes = (s<1500)|(s>400000)
return ['background-color: yellow' if v else '' for v in indexes]
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=['D','E','F','G','H','J','K'])

Related

How to style pandas dataframe using for loop

I have a dataset where I need to display different values with different colors. Not all the cells in the data are highlighted and only some of the data is highlighted.
Here are some of the colors:
dict_colors = {'a': 'red', 'b': 'blue','e':'tomato'}
How can I highlight all these cells with given colors?
MWE
# data
import pandas as pd
df = pd.DataFrame({'A': list('abcdef'), 'B': list('aabbcc'), 'C': list('aaabbb')})
# without for loop
(df.style
.apply(lambda dfx: ['background: red' if val == 'a' else '' for val in dfx], axis = 1)
.apply(lambda dfx: ['background: blue' if val == 'b' else '' for val in dfx], axis = 1)
)
# How to do this using for loop (I have so many values and different colors for them)
# My attempt
dict_colors = {'a': 'red', 'b': 'blue','e':'tomato'}
s = df.style
for key,color in dict_colors.items():
s = s.apply(lambda dfx: [f'background: {color}' if cell == key else '' for cell in dfx], axis = 1)
display(s)

You can try that:
import pandas as pd
df = pd.DataFrame({'A': list('abcdef'), 'B': list('aabbcc'), 'C': list('aaabbb')})
dict_colors = {'a': 'red', 'b': 'blue', 'e':'tomato'}
# create a Styler object for the DataFrame
s = df.style
def apply_color(val):
if val in dict_colors:
return f'background: {dict_colors[val]}'
return ''
# apply the style to each cell
s = s.applymap(apply_color)
# display the styled DataFrame
display(s)

I found a way using eval method, it is not the most elegant method but it works.
import pandas as pd
df = pd.DataFrame({'A': list('abcdef'), 'B': list('aabbcc'), 'C': list('aaabbb')})
dict_colors = {'a': 'red', 'b': 'blue','e':'tomato'}
lst = [ 'df.style']
for key,color in dict_colors.items():
text = f".apply(lambda dfx: ['background: {color}' if cell == '{key}' else '' for cell in dfx], axis = 1)"
lst.append(text)
s = ''.join(lst)
display(eval(s))

Feature engineering, ValueError: Columns must be same length as key

I'm running into a ValueError: Columns must be same length as key when trying to do encoding for the column Type. Here are the codes, not sure which part is wrong.
df.head()
plt.figure(figsize=(7, 5))
sns.heatmap(df.isnull(), cmap='viridis')
df.isnull().any()
df.isnull().sum()
df['Rating'] = df['Rating'].fillna(df['Rating'].median())
replaces = [u'\u00AE', u'\u2013', u'\u00C3', u'\u00E3', u'\u00B3', '[', ']', "'"]
for i in replaces:
df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : x.replace(i, ''))
regex = [r'[-+|/:/;(_)#]', r'\s+', r'[A-Za-z]+']
for j in regex:
df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : re.sub(j, '0', x))
df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : x.replace('.', ',',1).replace('.', '').replace(',', '.',1)).astype(float)
df['Current Ver'] = df['Current Ver'].fillna(df['Current Ver'].median())
i = df[df['Category'] == '1.9'].index
df.loc[i]
df = df.drop(i)
df = df[pd.notnull(df['Last Updated'])]
df = df[pd.notnull(df['Content Rating'])]
le = preprocessing.LabelEncoder()
df['App'] = le.fit_transform(df['App'])
category_list = df['Category'].unique().tolist()
category_list = ['cat_' + word for word in category_list]
df = pd.concat([df, pd.get_dummies(df['Category'], prefix='cat')], axis=1)
le = preprocessing.LabelEncoder()
df['Genres'] = le.fit_transform(df['Genres'])
le = preprocessing.LabelEncoder()
df['Content Rating'] = le.fit_transform(df['Content Rating'])
df['Price'] = df['Price'].apply(lambda x : x.strip('$'))
df['Installs'] = df['Installs'].apply(lambda x : x.strip('+').replace(',', ''))
df['Type'] = pd.get_dummies(df['Type'])

You are trying to map a DataFrame with multiple columns to one column to the original DataFrame.
pd.get_dummies returns a DataFrame with a column for each value in the column.
If you want to add those values to the original DataFrame you can use concat.
Example:
import pandas as pd
df = pd.DataFrame(data=['type1', 'type2', 'type3'], columns=['Type'])
dummies_df = pd.get_dummies(df['Type'])
pd.concat([df, dummies_df], axis=1)

Dictionary creation inside a function

Let's say I have the following dataframe:
import pandas as pd
data = {'Flag':['a', 'b', 'a', 'b'],
'Item':['ball', 'car', 'pen', 'candy'],
'Char1':[0, 0, 0, 0],
'Char2':[23, 21, 19, 13],
'Char3':[40, 43, 60, 70]}
df = pd.DataFrame(data)
Now, let's perform some calculation:
df['Char1_avg'] = df.apply(lambda x: df[df.Flag == x.Flag].Char1.mean(), axis=1)
df['Char1_std'] = df.apply(lambda x: df[df.Flag == x.Flag].Char1.std(), axis=1)
df['Char2_avg'] = df.apply(lambda x: df[df.Flag == x.Flag].Char2.mean(), axis=1)
df['Char2_std'] = df.apply(lambda x: df[df.Flag == x.Flag].Char2.std(), axis=1)
df['Char3_avg'] = df.apply(lambda x: df[df.Flag == x.Flag].Char3.mean(), axis=1)
df['Char3_std'] = df.apply(lambda x: df[df.Flag == x.Flag].Char3.std(), axis=1)
Finally let's create the following dictionary:
Flag_list = ['a','b']
sum_dict = {'Flag':Flag_list,
'Char1_average':df['Char1_avg'].head(2).tolist(),
'Char1_std':df['Char1_std'].head(2).tolist(),
'Char2_average':df['Char2_avg'].head(2).tolist(),
'Char2_std':df['Char2_std'].head(2).tolist(),
'Char3_average':df['Char3_avg'].head(2).tolist(),
'Char3_std':df['Char3_std'].head(2).tolist()}
In this way all works fine,
correct dictionary
but I need to define a function that performs the same things, so I have written the following code:
def fnctn(dataf):
param_list=["Char1", "Char2", 'Char3']
for param in param_list:
dataf[f'{param}_avg'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].mean(), axis=1)
dataf[f'{param}_StDev'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].std(), axis=1)
sum_dict = {'Flag':Flag_list,
f'{param}_average':dref[f'{param}_avg'].head(2).tolist(),
f'{param}_std':dref[f'{param}_StDev'].head(2).tolist()}
ref_avg_values = pd.DataFrame(sum_dict)
dataf = df.copy()
fnctn(dataf)
But this time the dictionary I get contains only the values of the last iteration:
wrong dictionary
How can I get the same dictionary as in the previous case?

you have to update it into the dictionary so that you have all the values that are iterated inside the for loop.
Here is the solution to your query:
def fnctn(dataf):
param_list=["Char1", "Char2", 'Char3']
dictie={}
for param in param_list:
dataf[f'{param}_avg'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].mean(), axis=1)
dataf[f'{param}_StDev'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].std(), axis=1)
sum_dict = {'Flag':Flag_list,
f'{param}_average':dataf[f'{param}_avg'].head(2).tolist(),
f'{param}_std':dataf[f'{param}_StDev'].head(2).tolist()}
dictie.update(sum_dict)
return pd.DataFrame(dictie)
dataf = df.copy()
fnctn(dataf)
And the answer is as below:

Why does the export of a styled pandas dataframe to Excel not work?

I would like to apply the same background color to cells that have for each PEOPLE instance the name and the related name. I have tried to df.style.applymap, it does not return an error but it does not seem to work. Anyone has any ideas why? Thank you.
clrs = list(mcolors.CSS4_COLORS.keys())
for k in range(len(PEOPLE)):
if PEOPLE[k].attribute == 'child':
df1_data = [PEOPLE[k].name, PEOPLE[k].related]
df.style.applymap([lambda x: 'background-color: yellow' if x in df1_data else 'background-color: red'])
df.to_excel('styledz.xlsx', engine='openpyxl')

Here is some more info on df.style. Here I'm using some simple example because I don't have your data available:
import pandas as pd
import numpy as np
df = pd.DataFrame({'a': np.random.randint(0, 10, 10), 'b': np.random.randint(0, 10, 10), 'late': np.random.choice([0, 1], 10).astype(np.bool)})
def highlight_late(s):
return ['background-color: red' if s['late'] else 'background-color: green' for s_ in s]
df = df.style.apply(highlight_late, axis=1)
df.to_excel('style.xlsx', engine='openpyxl')
Looks in the excel file like this:
For cell based coloring use:
def highlight_late(s):
return ['background-color: red' if s_ else 'background-color: green' for s_ in s]
df = df.style.apply(highlight_late, subset=["late"], axis=1)
This gives you:

Basically your solution will be a modification of the following:
df = DataFrame([['mark', 2], ['mike', 4], ['manny', 6]], columns=['name', 'attribute'])
def style_row(row, people):
output = Series("", index=row.index)
if row["name"] in people:
output['attribute'] = "background-color:red;"
return output
styler = df.style.apply(style_row, axis=1, people=['mark', 'manny'])
styler

searching in a pandas df that contains ranges

I have a pandas df that contains 2 columns 'start' and 'end' (both are integers). I would like an efficient method to search for rows such that the range that is represented by the row [start,end] contains a specific value.
Two additional notes:
It is possible to assume that ranges don't overlap
The solution should support a batch mode - that given a list of inputs, the output will be a mapping (dictionary or whatever) to the row indices that contain the matching range.
For example:
start end
0 7216 7342
1 7343 7343
2 7344 7471
3 7472 8239
4 8240 8495
and the query of
[7215,7217,7344]
will result in
{7217: 0, 7344: 2}
Thanks!

Brute force solution, could use lots of improvements though.
df = pd.DataFrame({'start': [7216, 7343, 7344, 7472, 8240],
'end': [7342, 7343, 7471, 8239, 8495]})
search = [7215, 7217, 7344]
res = {}
for i in search:
mask = (df.start <= i) & (df.end >= i)
idx = df[mask].index.values
if len(idx):
res[i] = idx[0]
print res
Yields
{7344: 2, 7217: 0}

Selected solution
This new solution could have better performances. But there is a limitation, it will only works if there is no gap between ranges like in the example provided.
# Test data
df = pd.DataFrame({'start': [7216, 7343, 7344, 7472, 8240],
'end': [7342, 7343, 7471, 8239, 8495]}, columns=['start','end'])
query = [7215,7217,7344]
# Reshaping the original DataFrame
df = df.reset_index()
df = pd.concat([df['start'], df['end']]).reset_index()
df = df.set_index(0).sort_index()
# Creating a DataFrame with a continuous index
max_range = max(df.index) + 1
min_range = min(df.index)
s = pd.DataFrame(index=range(min_range,max_range))
# Joining them
s = s.join(df)
# Filling the gaps
s = s.fillna(method='backfill')
# Then a simple selection gives the result
s.loc[query,:].dropna().to_dict()['index']
# Result
{7217: 0.0, 7344: 2.0}
Previous proposal
# Test data
df = pd.DataFrame({'start': [7216, 7343, 7344, 7472, 8240],
'end': [7342, 7343, 7471, 8239, 8495]}, columns=['start','end'])
# Constructing a DataFrame containing the query numbers
query = [7215,7217,7344]
result = pd.DataFrame(np.tile(query, (len(df), 1)), columns=query)
# Merging the data and the query
df = pd.concat([df, result], axis=1)
# Making the test
df = df.apply(lambda x: (x >= x['start']) & (x <= x['end']), axis=1).loc[:,query]
# Keeping only values found
df = df[df==True]
df = df.dropna(how='all', axis=(0,1))
# Extracting to the output format
result = df.to_dict('split')
result = dict(zip(result['columns'], result['index']))
# The result
{7217: 0, 7344: 2}

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to flag an outlier(s) /anomaly in selected columns in python? - python

Related

How to style pandas dataframe using for loop

Feature engineering, ValueError: Columns must be same length as key

Dictionary creation inside a function

Why does the export of a styled pandas dataframe to Excel not work?

searching in a pandas df that contains ranges

Categories

Resources