How to properly write if-then lambda statement for pandas df? - python

I have the following code:
import pandas as pd

data = [[11001218, 'Value', 93483.37, 'G', '', 93483.37, '', '56117J100', 'FRA', 'Equity'],
        [11001218, 'Value', 3572.73, 'G', 3572.73, '', '56117J100', '', 'LUM', 'Equity'],
        [11001218, 'Value', 89910.64, 'G', 89910.64, '', '56117J100', '', 'WAR', 'Equity'],
        [11005597, 'Value', 72640313.34, 'L', '', 72640313.34, 'REVR21964', '', 'IN2', 'Repo']]

df = pd.DataFrame(data, columns=['ID', 'Type', 'Diff', 'Group', 'Amount', 'Amount2', 'Id2', 'Id3', 'Executor', 'Name'])
def logic_builder(row, row2, row3):
    if row['Name'] == 'Repo' and row['Group'] == 'L':
        return 'Fine resultant'
    elif (row['ID'] == row2['ID']) and (row['ID'] == row3['ID']) and (row['Group'] == row2['Group']) and (row['Group'] == row3['Group']) and (row['Executor'] != row2['Executor']) and (row['Executor'] != row3['Executor']):
        return 'Difference in Executor'

df['Results'] = df.apply(lambda row: logic_builder(row, row2, row3), axis=1)
If you look at the first 3 rows, they are all technically the same: they contain the same ID, Type, Group, and Name. The only difference is the Executor, hence I would like my if-then statement to return "Difference in Executor". I am having trouble figuring out how to write the if-then so that it looks at all the rows with matching values in the fields I mentioned above.
Thank you.

You can pass a single row, then determine its index and look up the other rows with df.iloc[index].
Here is an example:
def logic_builder(row):
    global df  # you need to access the df
    i = row.name  # row index
    # get the next two rows
    try:
        row2 = df.iloc[i+1]
        row3 = df.iloc[i+2]
    except IndexError:
        return
    if row['Name'] == 'Repo' and row['Group'] == 'L':
        return 'Fine resultant'
    elif (row['ID'] == row2['ID']) and (row['ID'] == row3['ID']) and (row['Group'] == row2['Group']) and (row['Group'] == row3['Group']) and (row['Executor'] != row2['Executor']) and (row['Executor'] != row3['Executor']):
        return 'Difference in Executor'

df['Results'] = df.apply(logic_builder, axis=1)
Of course, since the result depends on the next two rows, the last two rows of the dataframe cannot be fully evaluated (the IndexError branch returns None for them).

You can modify the function a bit to operate on a chunk/slice of the dataframe per group, using groupby, since you are performing the action per group. A modified version of the function you have written would look something like this:
def logic_builder(group):
    if group['Name'].eq('Repo').all() and group['Group'].eq('L').all():
        return 'Fine resultant'
    elif group['Group'].nunique() == 1 and group['Executor'].nunique() > 1:
        return 'Difference in Executor'
Passing row1, row2, row3, ..., rowN is not going to work here, because each group may contain any number of rows. A better strategy is to express the if/else with all and nunique (which gives the number of unique values in the selected column), as in the logic above.
Then apply the function to the groupby object:
df.groupby('ID').apply(logic_builder)

ID
11001218    Difference in Executor
11005597            Fine resultant
dtype: object
Finally, you can join the values above back to the actual dataframe if needed, as sketched below.
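For instance, a minimal sketch of that join step (my addition, not part of the original answer), mapping the per-ID result back onto every row:
results = df.groupby('ID').apply(logic_builder)  # Series indexed by ID
df['Results'] = df['ID'].map(results)            # broadcast the group result to each row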

Related

How to flag outlier(s)/anomalies in selected columns in python?

In the dataset df below, I want to flag the anomalies in all columns except A, B, C and L.
Any value less than 1500 or greater than 400000 is regarded as an anomaly.
import pandas as pd

# initialise data of lists
data = {
    'A': ['T1', 'T2', 'T3', 'T4', 'T5'],
    'B': [1, 1, 1, 1, 1],
    'C': [1, 2, 3, 5, 9],
    'D': [12005, 18190, 1034, 15310, 31117],
    'E': [11021, 19112, 19021, 12, 24509],
    'F': [10022, 19910, 19113, 19999, 25519],
    'G': [14029, 29100, 39022, 24509, 412262],
    'H': [52119, 32991, 52883, 69359, 57835],
    'J': [41218, 52991, 55121, 69152, 79355],
    'K': [43211, 8199991, 56881, 212, 77342],
    'L': [1, 0, 1, 0, 0],
    'M': [31211, 42901, 53818, 62158, 69325],
}

# Create DataFrame
df = pd.DataFrame(data)

# Print the output.
df
Attempt:
exclude_cols = ['A', 'B', 'C', 'L']

def flag_outliers(s, exclude_cols):
    if s.name in exclude_cols:
        return ''  # or None, or whatever df.style() needs
    else:
        s = pd.to_numeric(s, errors='coerce')
        indexes = (s < 1500) | (s > 400000)
        return ['background-color: red' if v else '' for v in indexes]

df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1)
Result of the code:
Desired output should look like this:
Thanks for the effort!
If you pass subset as an argument to the apply function, you will get what you want.
exclude_cols = ['A', 'B', 'C', 'L']

def flag_outliers(s, exclude_cols):
    if s.name in exclude_cols:
        print(s.name)
        return ''  # or None, or whatever df.style() needs
    else:
        s = pd.to_numeric(s, errors='coerce')
        indexes = (s < 1500) | (s > 400000)
        return ['background-color: yellow' if v else '' for v in indexes]

df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1,
               subset=['D', 'E', 'F', 'G', 'H', 'J', 'K'])
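An equivalent, slightly simpler sketch (my assumption, not from the original answer): restrict the subset to the numeric columns and apply column-wise, so the exclude_cols check is no longer needed inside the styling function.
numeric_cols = [c for c in df.columns if c not in exclude_cols]

def highlight(col):
    # col is a single numeric column; flag values outside [1500, 400000]
    mask = (col < 1500) | (col > 400000)
    return ['background-color: yellow' if v else '' for v in mask]

df.style.apply(highlight, subset=numeric_cols)  # axis=0 (column-wise) is the default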

Select rows from a DataFrame using .loc and multiple conditions and then show the row corresponding to the min/max of one column

I know how to select data using .loc and multiple conditions, like so:
df.loc[(df['A'] == True)&(df['B'] == 'Tuesday')]
But from the result of this I can't figure out how to show the entire row corresponding to the min (or max) taken on one other column of numbers, 'C'. How do I do this?
Use this:
df2 = df.loc[(df['A'] == True)&(df['B'] == 'Tuesday')]
df2.loc[df2.C == df2.C.min(), :]
Use this:
for columns:
df.loc[(df['A'] == True)&(df['B'] == 'Tuesday')].apply(max, axis=0)
for rows:
df.loc[(df['A'] == True)&(df['B'] == 'Tuesday')].apply(max, axis=1)
You can use the idxmin or idxmax functions.
Docs for the idxmin function: "Return the row label of the minimum value. If multiple values equal the minimum, the first row label with that value is returned."
So, if you use
df.loc[df.loc[(df['A'] == True) & (df['B'] == 'Tuesday'), 'C'].idxmin()]
this will return the row which has the minimum value for column C among the filtered rows.
The easiest option:
df = pd.DataFrame({
    'A': [True, False, True, True],
    'B': ['Sun', 'Mon', 'Tue', 'Tue'],
    'C': [1, 4, 5, 1],
    'D': [10, 20, 30, 40]})

print(df.query(""" A == True and B == 'Tue' and C == C.min() """))

      A    B  C   D
3  True  Tue  1  40
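One caveat (my addition, not part of the original answer): inside query, C.min() is evaluated over the whole column, not over the rows already filtered by the other conditions; in this sample the two coincide. If the minimum should be taken over the filtered rows only, a two-step sketch avoids the ambiguity:
subset = df.query("A == True and B == 'Tue'")
print(subset.loc[[subset['C'].idxmin()]])  # double brackets keep a one-row DataFrame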

pyspark sql functions instead of rdd distinct

I have been attempting to replace strings in a data set for specific columns, mapping 'Y' to 1 and anything else to 0.
I have managed to identify which columns to target, using a dataframe-to-rdd conversion with a lambda, but it is taking a while to process.
A switch to an rdd is done for each column and then a distinct is performed, and this is taking a while!
If a 'Y' exists in the distinct result set, the column is identified as requiring a transformation.
I was wondering if anyone can suggest how I can use pyspark sql functions exclusively to obtain the same result, instead of having to switch to an rdd for each column?
The code, on sample data, is as follows:
import pyspark.sql.types as typ
import pyspark.sql.functions as func

col_names = [
    ('ALIVE', typ.StringType()),
    ('AGE', typ.IntegerType()),
    ('CAGE', typ.IntegerType()),
    ('CNT1', typ.IntegerType()),
    ('CNT2', typ.IntegerType()),
    ('CNT3', typ.IntegerType()),
    ('HE', typ.IntegerType()),
    ('WE', typ.IntegerType()),
    ('WG', typ.IntegerType()),
    ('DBP', typ.StringType()),
    ('DBG', typ.StringType()),
    ('HT1', typ.StringType()),
    ('HT2', typ.StringType()),
    ('PREV', typ.StringType())
]

schema = typ.StructType([typ.StructField(c[0], c[1], False) for c in col_names])

df = spark.createDataFrame([('Y', 22, 56, 4, 3, 65, 180, 198, 18, 'N', 'Y', 'N', 'N', 'N'),
                            ('N', 38, 79, 3, 4, 63, 155, 167, 12, 'N', 'N', 'N', 'Y', 'N'),
                            ('Y', 39, 81, 6, 6, 60, 128, 152, 24, 'N', 'N', 'N', 'N', 'Y')],
                           schema=schema)

cols = [(col.name, col.dataType) for col in df.schema]
transform_cols = []
for s in cols:
    if s[1] == typ.StringType():
        distinct_result = df.select(s[0]).distinct().rdd.map(lambda row: row[0]).collect()
        if 'Y' in distinct_result:
            transform_cols.append(s[0])

print(transform_cols)
The output is:
['ALIVE', 'DBG', 'HT2', 'PREV']
I managed to use a udf to do the task. First, pick the columns with Y or N values (here I use func.first in order to skim through the first row):
cols_sel = df.select([func.first(col).alias(col) for col in df.columns]).collect()[0].asDict()
cols = [col_name for (col_name, v) in cols_sel.items() if v in ['Y', 'N']]
# returns ['HT2', 'ALIVE', 'DBP', 'HT1', 'PREV', 'DBG']
Next, you can create a udf function to map Y, N to 1, 0.
def map_input(val):
    map_dict = dict(zip(['Y', 'N'], [1, 0]))
    return map_dict.get(val)

udf_map_input = func.udf(map_input, returnType=typ.IntegerType())

for col in cols:
    df = df.withColumn(col, udf_map_input(col))
df.show()
Finally, you can sum each column. I then transform the output into a dictionary and check which columns have a value greater than 0 (i.e. contain Y):
out = df.select([func.sum(col).alias(col) for col in cols]).collect()
out = out[0]
print([col_name for (col_name, val) in out.asDict().items() if val > 0])
Output
['DBG', 'HT2', 'ALIVE', 'PREV']
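For completeness, here is a sketch (my suggestion, not part of the original answer) of how the 'Y' columns could be found with a single aggregation using only pyspark.sql functions, with no rdd and no udf. It assumes the original df, before the udf-based replacement above:
# Only consider the string-typed columns, as in the question.
string_cols = [f.name for f in df.schema.fields if isinstance(f.dataType, typ.StringType)]

# For each column, 1 if any row equals 'Y', else 0 -- computed in one pass over the data.
flags = df.select([
    func.max(func.when(func.col(c) == 'Y', 1).otherwise(0)).alias(c)
    for c in string_cols
]).collect()[0].asDict()

transform_cols = [c for c, flag in flags.items() if flag == 1]
print(transform_cols)  # expected to match ['ALIVE', 'DBG', 'HT2', 'PREV'] on the sample data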

Masking a DataFrame on multiple column conditions - inside a loop

I would like to mask my dataframe conditional on multiple columns inside a loop. I am trying to do something like this:
dfs = []
val_dict = {0: 'a', 1: 'b', 2: 'c', 3: 'd'}
for i in range(4):
    items = [val_dict[i] for i in range(i+1)]
    df_ = df[(df['0'] == items[0]) & (df['1'] == items[1]) & ... ]
    dfs.append(df_)
Please note that the second condition I wrote above would not exist for the first iteration of the loop because there would be no items[1] element.
Here is a sample dataframe you are welcome to test on:
df = pd.DataFrame({'0': ['a']*3 + ['b']*3 + ['c']*3,
                   '1': ['a']*3 + ['b']*6,
                   '2': ['b']*4 + ['c']*5,
                   '3': ['c']*5 + ['d']*4})
The only solution I have come up with uses eval which I would like very much to avoid.
If you subset your DataFrame to include only the columns you want to use for comparison (as you have done in your example) and the keys in your val_dict are the same as the columns you want to compare, then you can get Pandas to do this for you.
Making a slight modification to your df:
df = pd.DataFrame({0: ['a']*3 + ['b']*3 + ['c']*3,
                   1: ['a']*3 + ['d']*6,
                   2: ['b']*4 + ['c']*5,
                   3: ['c']*5 + ['a']*4})
You can now accomplish what you want with the following:
dfs = []
val_dict = {0: 'a', 1: 'b', 2: 'c', 3: 'd'}
val_series = pd.Series(val_dict)

for i in range(4):
    mask = (df == val_series).all(axis=1)
    dfs.append(df[mask])
EDIT
I am leaving my original solution even though it addresses a different problem than OP intended to solve. The intended problem can be solved by the following:
mask = True
for key in range(4):
    mask &= df[key] == val_dict[key]
    dfs.append(df[mask])
Again, this is using the modified df used earlier in my original answer.
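If you prefer to keep the original df with its string column names ('0' through '3'), essentially the same cumulative-mask idea works without eval; a minimal sketch (my addition, assuming the original df and val_dict):
dfs = []
mask = pd.Series(True, index=df.index)
for i in range(4):
    mask &= df[str(i)] == val_dict[i]  # add one more condition each iteration
    dfs.append(df[mask])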
I'll share my eval solution.
for i in range(4):
    items = [val_dict[i] for i in range(i+1)]
    df_ = eval('df[(' + ') & ('.join(['df["'+str(j)+'"] == items['+str(j)+']' for j in range(i+1)]) + ')]')
    dfs.append(df_)
It works... but so ugly :(

searching in a pandas df that contains ranges

I have a pandas df that contains 2 columns, 'start' and 'end' (both integers). I would like an efficient method to search for rows such that the range represented by the row, [start, end], contains a specific value.
Two additional notes:
It is possible to assume that ranges don't overlap
The solution should support a batch mode: given a list of inputs, the output should be a mapping (a dictionary or whatever) from each input value to the index of the row whose range contains it.
For example:
   start   end
0   7216  7342
1   7343  7343
2   7344  7471
3   7472  8239
4   8240  8495
and the query of
[7215,7217,7344]
will result in
{7217: 0, 7344: 2}
Thanks!
Brute force solution, could use lots of improvements though.
df = pd.DataFrame({'start': [7216, 7343, 7344, 7472, 8240],
                   'end': [7342, 7343, 7471, 8239, 8495]})
search = [7215, 7217, 7344]
res = {}
for i in search:
    mask = (df.start <= i) & (df.end >= i)
    idx = df[mask].index.values
    if len(idx):
        res[i] = idx[0]
print(res)
Yields
{7344: 2, 7217: 0}
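A faster alternative worth sketching (my suggestion, assuming a pandas version that provides IntervalIndex): build an IntervalIndex from the start/end columns and let get_indexer do the containment lookup, which works here because the ranges don't overlap.
intervals = pd.IntervalIndex.from_arrays(df['start'], df['end'], closed='both')
positions = intervals.get_indexer(search)   # -1 means no containing range
# positions are positional; with the default RangeIndex they equal the row labels
res = {q: pos for q, pos in zip(search, positions) if pos != -1}
print(res)  # {7217: 0, 7344: 2}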
Selected solution
This new solution could have better performance. But there is a limitation: it only works if there are no gaps between ranges, as in the example provided.
# Test data
df = pd.DataFrame({'start': [7216, 7343, 7344, 7472, 8240],
                   'end': [7342, 7343, 7471, 8239, 8495]}, columns=['start', 'end'])
query = [7215,7217,7344]
# Reshaping the original DataFrame
df = df.reset_index()
df = pd.concat([df['start'], df['end']]).reset_index()
df = df.set_index(0).sort_index()
# Creating a DataFrame with a continuous index
max_range = max(df.index) + 1
min_range = min(df.index)
s = pd.DataFrame(index=range(min_range,max_range))
# Joining them
s = s.join(df)
# Filling the gaps
s = s.fillna(method='backfill')
# Then a simple selection gives the result
s.loc[query,:].dropna().to_dict()['index']
# Result
{7217: 0.0, 7344: 2.0}
Previous proposal
# Test data
df = pd.DataFrame({'start': [7216, 7343, 7344, 7472, 8240],
                   'end': [7342, 7343, 7471, 8239, 8495]}, columns=['start', 'end'])
# Constructing a DataFrame containing the query numbers
query = [7215,7217,7344]
result = pd.DataFrame(np.tile(query, (len(df), 1)), columns=query)
# Merging the data and the query
df = pd.concat([df, result], axis=1)
# Making the test
df = df.apply(lambda x: (x >= x['start']) & (x <= x['end']), axis=1).loc[:,query]
# Keeping only values found
df = df[df==True]
df = df.dropna(how='all', axis=(0,1))
# Extracting to the output format
result = df.to_dict('split')
result = dict(zip(result['columns'], result['index']))
# The result
{7217: 0, 7344: 2}
