Squeezing pandas DataFrame to have non-null values and modify column names - python

I have the following sample DataFrame
import numpy as np
import pandas as pd
df = pd.DataFrame({'Tom': [2, np.nan, np.nan],
                   'Ron': [np.nan, 5, np.nan],
                   'Jim': [np.nan, np.nan, 6],
                   'Mat': [7, np.nan, np.nan]},
                  index=['Min', 'Max', 'Avg'])
that looks like this, where each column has only one non-null value:
     Tom  Ron  Jim  Mat
Min  2.0  NaN  NaN  7.0
Max  NaN  5.0  NaN  NaN
Avg  NaN  NaN  6.0  NaN
Desired Outcome
For each column, I want to keep the non-null value and append the index label of that value to the column name. So the final result should look like this:
   Tom_Min  Ron_Max  Jim_Avg  Mat_Min
0      2.0      5.0      6.0      7.0
My attempt
Using list comprehensions: find the non-null value in each column, append the corresponding index label to the column name, and then create a new DataFrame:
values = [df[col][~pd.isna(df[col])].values[0] for col in df.columns]
# [2.0, 5.0, 6.0, 7.0]
new_cols = [col + '_{}'.format(df[col][~pd.isna(df[col])].index[0]) for col in df.columns]
# ['Tom_Min', 'Ron_Max', 'Jim_Avg', 'Mat_Min']
df_new = pd.DataFrame([values], columns=new_cols)
My question
Is there some built-in functionality in pandas that can do this without for loops or list comprehensions?

If there is only one non-missing value per column, you can use DataFrame.stack, convert the resulting Series to a one-row DataFrame, and then flatten the MultiIndex columns; for the correct column order, use DataFrame.swaplevel with DataFrame.reindex:
df = (df.stack().to_frame().T
        .swaplevel(1, 0, axis=1)
        .reindex(df.columns, level=0, axis=1))
df.columns = df.columns.map('_'.join)
print(df)
   Tom_Min  Ron_Max  Jim_Avg  Mat_Min
0      2.0      5.0      6.0      7.0

Use:
s = df.T.stack()
s.index = s.index.map('_'.join)
df = s.to_frame().T
Result:
# print(df)
   Tom_Min  Ron_Max  Jim_Avg  Mat_Min
0      2.0      5.0      6.0      7.0
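For reference, a sketch (my addition, assuming the original df) of the intermediate Series: stacking the transpose iterates the original columns first, so the order already matches and no reindexing is needed:
s = df.T.stack()
print(s)
# Tom  Min    2.0
# Ron  Max    5.0
# Jim  Avg    6.0
# Mat  Min    7.0
# dtype: float64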

Related

insert missing rows in df with dictionary values

Hello, I have the following DataFrame:
df = pd.DataFrame(data={'grade_1': ['A', 'B', 'C'],
                        'grade_1_count': [19, 28, 32],
                        'grade_2': ['pass', 'fail', np.nan],
                        'grade_2_count': [39, 18, np.nan]})
whereby some grades are missing and need to be inserted into the grade_n column according to the values in this dictionary:
grade_dict = {'grade_1': ['A', 'B', 'C', 'D', 'E', 'F'],
              'grade_2': ['pass', 'fail', 'not present', 'borderline']}
and the corresponding row value in the _count column should be filled with np.nan, so the expected output is like this:
expected_df = pd.DataFrame(data={'grade_1': ['A', 'B', 'C', 'D', 'E', 'F'],
                                 'grade_1_count': [19, 28, 32, 0, 0, 0],
                                 'grade_2': ['pass', 'fail', 'not present', 'borderline', np.nan, np.nan],
                                 'grade_2_count': [39, 18, 0, 0, np.nan, np.nan]})
So far I have this rather inelegant code that creates a column including all the correct categories for the grades, but I cannot reinsert it into the DataFrame or fill the count columns with zeros (the np.nans just reflect empty cells due to coercing columns with different numbers of rows). I hope that makes sense. Any advice would be great. Thanks.
x = []
for k, v in grade_dict.items():
    out = df[k].reindex(grade_dict[k], axis=0, fill_value=0)
    x = pd.concat([out], axis=1)
    x[k] = x.index
    x = x.reset_index(drop=True)
    df[k] = x.fillna(np.nan)
Here is a solution using two consecutive merges:
# set up combinations
from itertools import zip_longest
df2 = pd.DataFrame(list(zip_longest(*grade_dict.values())), columns=grade_dict)

# merge
(df2.merge(df.filter(like='grade_1'),
           on='grade_1', how='left')
    .merge(df.filter(like='grade_2'),
           on='grade_2', how='left')
    .sort_index(axis=1)
)
output:
  grade_1  grade_1_count      grade_2  grade_2_count
0       A           19.0         pass           39.0
1       B           28.0         fail           18.0
2       C           32.0  not present            NaN
3       D            NaN   borderline            NaN
4       E            NaN         None            NaN
5       F            NaN         None            NaN
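For reference, the df2 scaffold built with zip_longest pads the shorter list with None; a sketch (my addition) of what it contains:
print(df2)
#   grade_1      grade_2
# 0       A         pass
# 1       B         fail
# 2       C  not present
# 3       D   borderline
# 4       E         None
# 5       F         None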
Or, generalizing to any number of grade columns, use multiple merges in a loop:
df2 = pd.DataFrame(list(zip_longest(*grade_dict.values())), columns=grade_dict)
for col in grade_dict:
    df2 = df2.merge(df.filter(like=col),
                    on=col, how='left')
df2
If you only need to merge on grade_1 without updating the non-NaNs of grade_2, you can cast grade_dict into a df and then use combine_first:
print(df.set_index("grade_1")
        .combine_first(pd.DataFrame(grade_dict.values(),
                                    index=grade_dict.keys()).T.set_index("grade_1"))
        .fillna({"grade_1_count": 0}).reset_index())
  grade_1  grade_1_count      grade_2  grade_2_count
0       A           19.0         pass           39.0
1       B           28.0         fail           18.0
2       C           32.0  not present            NaN
3       D            0.0   borderline            NaN
4       E            0.0         None            NaN
5       F            0.0         None            NaN

Group latest values in pandas columns for a given id

I have a pandas dataframe containing some metrics for a given date and user.
>>> pd.DataFrame({"user": ['juan', 'juan', 'juan', 'gonzalo'],
...               "date": [1, 2, 3, 1],
...               "var1": [1, 2, None, 1],
...               "var2": [None, 4, 5, 6]})
      user  date  var1  var2
0     juan     1   1.0   NaN
1     juan     2   2.0   4.0
2     juan     3   NaN   5.0
3  gonzalo     1   1.0   6.0
Now, for each user, I want to extract the 2 most recent values for each variable (var1, var2), ignoring NaN unless there aren't enough values to fill the data.
For reference, this should be the resulting dataframe for the data depicted above
      user  var1_0  var1_1  var2_0  var2_1
      juan     2.0     1.0     5.0     4.0
   gonzalo     1.0     NaN     6.0     NaN
each "historical" value is added as a new column with a _0 or _1 suffix.
First, sort by both columns with DataFrame.sort_values if necessary, reshape with DataFrame.melt and remove missing values, filter the top 2 rows per group with GroupBy.head, then create a counter column with GroupBy.cumcount, pivot with DataFrame.pivot, and flatten the MultiIndex:
df1 = (df.sort_values(['user', 'date'])
         .melt(id_vars='user', value_vars=['var1', 'var2'])
         .dropna(subset=['value'])
       )
df1 = df1.groupby(['user', 'variable']).head(2)
df1['g'] = df1.groupby(['user', 'variable']).cumcount(ascending=False)
df1 = df1.pivot(index='user', columns=['variable', 'g'], values='value')
# older pandas versions:
# df1 = df1.set_index(['user', 'variable', 'g'])['value'].unstack([1, 2])
df1.columns = df1.columns.map(lambda x: f'{x[0]}_{x[1]}')
df1 = df1.reset_index()
print(df1)
      user  var1_0  var1_1  var2_0  var2_1
0  gonzalo     1.0     NaN     6.0     NaN
1     juan     2.0     1.0     5.0     4.0
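To see why cumcount(ascending=False) puts the most recent value into the _0 column, here is a sketch (my addition) of df1 just before the pivot step, given the sample data:
#       user variable  value  g
# 0  gonzalo     var1    1.0  0
# 1     juan     var1    1.0  1
# 2     juan     var1    2.0  0
# 4  gonzalo     var2    6.0  0
# 6     juan     var2    4.0  1
# 7     juan     var2    5.0  0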
You could group by user and aggregate to get the 2 most recent values. That gets you almost all the way there, but instead of columns you have a list of elements. If you want the actual 2 columns, you have to split the newly created list into columns. Full code:
import pandas as pd
import numpy as np

df = pd.DataFrame(
    {
        "user": ["juan", "juan", "juan", "gonzalo"],
        "date": [1, 2, 3, 1],
        "var1": [1, 2, None, 1],
        "var2": [None, 4, 5, 6],
    }
)

# This almost gets you there
df = (
    df.sort_values(by="date")
    .groupby("user")
    .agg({"var1": lambda x: x.dropna().head(2), "var2": lambda x: x.dropna().head(2)})
)

# Split the columns and get the correct column names
df[["var1_0", "var2_0"]] = df[["var1", "var2"]].apply(
    lambda row: pd.Series(el[0] if isinstance(el, np.ndarray) else el for el in row),
    axis=1,
)
df[["var1_1", "var2_1"]] = df[["var1", "var2"]].apply(
    lambda row: pd.Series(el[-1] if isinstance(el, np.ndarray) else None for el in row),
    axis=1,
)
print(df)
>>
               var1        var2  var1_0  var2_0  var1_1  var2_1
user
gonzalo         1.0         6.0     1.0     6.0     NaN     NaN
juan     [1.0, 2.0]  [4.0, 5.0]     1.0     4.0     2.0     5.0

Python 3 / Pandas | Using Function with IF & ELIF statements to populate a column

Python 3 / Pandas
I am trying to use a function to check the values of various columns in a dataframe, and select only the value from the column that is not NaN.
The data is structured so there is one main column df['C1'] that I want to populate based on the value in one of the next four columns, df['C2'], df['C3'], df['C4'] and df['C5']. When I observe the data, I see that in each row, the value in every one of the columns df['C2'] through df['C5'] is NaN except for one column, which has a text value. This is true for all rows in the dataframe. I am trying to write a function, applied to the dataframe, that finds the column with the text value and copies that value into df['C1'].
Here is the function I wrote:
def get_component(df):
    if ~df['C2'].isna():
        return df['C2']
    elif ~df['C3'].isna():
        return df['C3']
    elif ~df['C4'].isna():
        return df['C4']
    elif ~df['C5'].isna():
        return df['C5']

df['C1'] = df.apply(get_component, axis=1)
But I get the following error:
AttributeError: ("'float' object has no attribute 'isna'", 'occurred at index 0')
Any ideas on how to fix this error so I can achieve this objective? Is there another method to achieve the same result?
Thanks for the help!
Never mind, I figured it out. I just stumbled upon np.where and used the following code to solve the problem:
df['C1'] = np.where(~df['C2'].isna(), df['C2'],
           np.where(~df['C3'].isna(), df['C3'],
           np.where(~df['C4'].isna(), df['C4'],
           np.where(~df['C5'].isna(), df['C5'], None))))
A solution that makes use of pandas' stack method:
import pandas as pd
import numpy as np

# Initialize example dataframe
df = pd.DataFrame({
    "C2": [np.nan, 3, np.nan, np.nan, np.nan],
    "C3": [5, np.nan, np.nan, np.nan, np.nan],
    "C4": [np.nan, np.nan, np.nan, 7, 3],
    "C5": [np.nan, np.nan, 2, np.nan, np.nan],
})

# each row has exactly one non-NaN, so stacking drops the NaNs
# and leaves exactly one value per row
df["C1"] = df.stack().to_numpy()
print(df)
# Output:
#     C2   C3   C4   C5   C1
# 0  NaN  5.0  NaN  NaN  5.0
# 1  3.0  NaN  NaN  NaN  3.0
# 2  NaN  NaN  NaN  2.0  2.0
# 3  NaN  NaN  7.0  NaN  7.0
# 4  NaN  NaN  3.0  NaN  3.0
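As a further alternative (my addition, not from the original answers), a common idiom for "first non-null across columns" is to back-fill along the rows and take the first column; like the stack approach, this assumes the value you want is the left-most non-NaN in each row:
df["C1"] = df[["C2", "C3", "C4", "C5"]].bfill(axis=1).iloc[:, 0]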

Combine two columns in a DataFrame pandas

I have a DataFrame with multiple columns, some of which are equivalent (the same key at the trailing end, e.g. column1 = 'a/first', column2 = 'b/first'). I want to merge each such pair of columns. Please help me solve the problem.
My Dataframe looks like
name  g1/column1  g1/column2  g1/g2/column1  g2/column2
AAAA          10          20            nan         nan
AAAA         nan         nan             30          40
My result should be as follows:
name  g1/column1  g1/column2
AAAA          10          20
AAAA          30          40
Thanks in advance
Use:
# move the column that is not being merged into the index
df = df.set_index('name')
# build a MultiIndex by splitting on the last '/'
df.columns = df.columns.str.rsplit('/', n=1, expand=True)
# aggregate the first non-NaN value per second level of the MultiIndex
df = df.groupby(level=1, axis=1).first()
print(df)
      column1  column2
name
AAAA     10.0     20.0
AAAA     30.0     40.0
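For reference, a sketch (my addition) of what the rsplit produces for the sample columns: only the last '/' is split, so the trailing key lands in the second level, which is why the groupby uses level=1:
cols = pd.Index(['g1/column1', 'g1/column2', 'g1/g2/column1', 'g2/column2'])
print(cols.str.rsplit('/', n=1, expand=True).tolist())
# [('g1', 'column1'), ('g1', 'column2'), ('g1/g2', 'column1'), ('g2', 'column2')]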
You need df.combine_first:
col1 = ['g1/column1', 'g1/column2']
col2 = ['g1/g2/column1', 'g2/column2']
df[col1] = df[col1].combine_first(pd.DataFrame(df[col2].values, columns=col1))
df = df.drop(col2, axis=1)
print(df)
#   name  g1/column1  g1/column2
# 0 AAAA        10.0        20.0
# 1 AAAA        30.0        40.0
One possible solution:
df = pd.DataFrame([[10, 20, np.nan, np.nan],
                   [np.nan, np.nan, 30, 40]],
                  columns=['g1/column1', 'g1/column2', 'g1/g2/column1', 'g2/column2'])
df
   g1/column1  g1/column2  g1/g2/column1  g2/column2
0        10.0        20.0            NaN         NaN
1         NaN         NaN           30.0        40.0

df = df.fillna(0)  # <- replacing all NaN with 0
ndf = pd.DataFrame()
unique_cols = ['column1', 'column2']
for i in range(len(unique_cols)):
    val = df.columns[df.columns.str.contains(unique_cols[i])]
    # sum row-wise across the matching columns
    ndf[val[0]] = df.loc[:, val].sum(axis=1)
ndf  # <- You can add index if you need (AAAA, AAAA)
   g1/column1  g1/column2
0        10.0        20.0
1        30.0        40.0
import pandas as pd
import numpy as np

g1 = [20, np.nan, 30, np.nan]
g1_2 = [10, np.nan, 20, np.nan]
g2 = [np.nan, 30, np.nan, 40]
g2_2 = [np.nan, 10, np.nan, 30]
dataList = list(zip(g1, g1_2, g2, g2_2))
df = pd.DataFrame(data=dataList,
                  columns=['g1/column1', 'g1/column2', 'g1/g2/column1', 'g2/column2'])

df.fillna(0, inplace=True)
df['g1Combined'] = df['g1/column1'] + df['g1/g2/column1']
df['g2Combined'] = df['g1/column2'] + df['g2/column2']
df.drop('g1/column1', axis=1, inplace=True)
df.drop('g1/column2', axis=1, inplace=True)
df.drop('g1/g2/column1', axis=1, inplace=True)
df.drop('g2/column2', axis=1, inplace=True)
df

Python: Fill 'na' in pandas column with random elements from a list

I am trying to fill 'NA' in a pandas column by randomly selecting elements from a list.
For example:
import pandas as pd

df = pd.DataFrame()
df['A'] = [1, 2, None, 5, 53, None]
fill_list = [22, 56, 84]
Is it possible to write a function which takes the pandas DF with column name as input and replaces all NA by randomly selecting elements from the list 'fill_list'?
fun(df['column_name'], fill_list)
Create a new Series with numpy.random.choice and then replace NaNs with fillna or combine_first:
import numpy as np

df['A'] = df['A'].fillna(pd.Series(np.random.choice(fill_list, size=len(df.index))))
# alternative:
# df['A'] = df['A'].combine_first(pd.Series(np.random.choice(fill_list, size=len(df.index))))
print(df)
      A
0   1.0
1   2.0
2  84.0
3   5.0
4  53.0
5  56.0
Or:
# get mask of NaNs
m = df['A'].isnull()
# count rows with NaNs
l = m.sum()
# create array of random picks with size l
s = np.random.choice(fill_list, size=l)
# set the NaN positions
df.loc[m, 'A'] = s
print(df)
      A
0   1.0
1   2.0
2  56.0
3   5.0
4  53.0
5  56.0
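To package this as the function the question asks for, a minimal sketch (the name fill_na_random and the seed parameter are my additions):
import numpy as np
import pandas as pd

def fill_na_random(series, fill_list, seed=None):
    # return a copy of `series` with each NaN replaced by a random pick from `fill_list`
    rng = np.random.default_rng(seed)
    out = series.copy()
    mask = out.isna()
    out.loc[mask] = rng.choice(fill_list, size=mask.sum())
    return out

df['A'] = fill_na_random(df['A'], fill_list)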
data_rnr['CO BORROWER NAME'].fillna("NO", inplace=True)
data_rnr['ET REASON'].fillna("ET REASON NOT AVAILABLE", inplace=True)
data_rnr['INSURANCE COMPANY NM'].fillna("INSURANCE COMPANY-NOT AVAILABLE", inplace=True)
data_rnr['GENDER'].fillna("GENDER DATA- NOT AVAILABLE", inplace=True)
