Related
I have a dataframe df (see image below) which I need to merge with N dataframes.
In this post, for the sake of clarity, N=3.
The goal is to check whether every value of the column Id exists in the three other dataframes with the same associated Value. If so, the row has to be highlighted in green. That's it!
Code :
import pandas as pd
import numpy as np
### --- Dataframes
df = pd.DataFrame({
    'Id': ['AA', 'BB', 'CC', 'DD', 'EE'],
    'Value': ['three', 'two', 'five', 'four', 'one'],
})
df1 = pd.DataFrame({
    'Id1': [np.nan, 'CC', 'BB', 'DD', np.nan],
    'Value1': ['one', 'four', 'two', np.nan, np.nan],
})
df2 = pd.DataFrame({
    'Id2': ['AA', 'BB', 'CC', 'DD', 'JJ'],
    'Value2': [np.nan, 'two', 'five', np.nan, 'six'],
})
df3 = pd.DataFrame({
    'Id3': ['FF', 'HH', 'CC', 'GG', 'BB'],
    'Value3': ['seven', 'five', 'one', 'three', 'two'],
})
### --- Joining df to df1, df2 and df3 (left joins, all keyed on the base Id)
df_df1 = pd.merge(df, df1, left_on='Id', right_on='Id1', how='left')
df_df1_df2 = pd.merge(df_df1, df2, left_on='Id', right_on='Id2', how='left')
df_df1_df2_df3 = pd.merge(df_df1_df2, df3, left_on='Id', right_on='Id3', how='left')
### --- Creating a function to highlight the aligned rows
def highlight_aligned_row(x):
    """Build a style frame for ``Styler.apply(axis=None)``: paint a whole row
    green when the row's Id matches Id1/Id2/Id3 AND its Value matches
    Value1/Value2/Value3.

    Parameters
    ----------
    x : pd.DataFrame
        The merged frame (Id/Value plus the suffixed columns from each merge).

    Returns
    -------
    pd.DataFrame
        Same shape as ``x``, holding CSS strings.
    """
    # NaN comparisons are False, so rows with missing matches never qualify.
    m1 = (x['Id'] == x['Id1']) & (x['Id'] == x['Id2']) & (x['Id'] == x['Id3'])
    m2 = (x['Value'] == x['Value1']) & (x['Value'] == x['Value2']) & (x['Value'] == x['Value3'])
    styles = pd.DataFrame('background-color: ', index=x.index, columns=x.columns)
    # Fix: apply the mask to every column so the ENTIRE row is highlighted
    # (the original set only the 'Id' column, leaving the rest unstyled).
    styles.loc[m1 & m2, :] = 'background-color: green'
    return styles
>>> df_df1_df2_df3.style.apply(highlight_aligned_row, axis=None)
My questions are:
How do we highlight the entire row when a condition is fulfilled?
Is there a more efficient way to merge 10 dataframes?
How can we check if every value/row of the original dataframe is aligned with the values of the final dataframe (after the merge)?
Thank you in advance for your suggestions and your help!
I would do it like this. Hope the comments in between make clear what I am doing. Hopefully, they also answer your questions, but let me know if anything remains unclear.
import pandas as pd
import numpy as np
df = pd.DataFrame({
    'Id': ['AA', 'BB', 'CC', 'DD', 'EE'],
    'Value': ['three', 'two', 'five', 'four', 'one'],
})
df1 = pd.DataFrame({
    'Id1': [np.nan, 'BB', 'CC', 'DD', np.nan],
    'Value1': ['one', 'two', 'four', np.nan, np.nan],
})
df2 = pd.DataFrame({
    'Id2': ['AA', 'BB', 'CC', 'DD', 'JJ'],
    'Value2': [np.nan, 'two', 'five', np.nan, 'six'],
})
df3 = pd.DataFrame({
    'Id3': ['FF', 'BB', 'CC', 'GG', 'HH'],
    'Value3': ['seven', 'two', 'one', 'v4', 'v5'],
})
# *IF* all the frames shared the same shape and index, the simplest way to
# combine them would be pd.concat([df, df1, df2, df3], axis=1).
# *BUT* that is not the case here, so left-merge each frame onto the base
# frame one at a time, keying every merge on that frame's own Id* column.
dfs = [df, df1, df2, df3]
df_all = dfs[0]
for other in dfs[1:]:
    id_col = next(c for c in other.columns if c.startswith('Id'))
    df_all = df_all.merge(other, left_on='Id', right_on=id_col, how='left')
def highlight_aligned_row(x):
    """Return an array of CSS strings for ``Styler.apply(axis=None)``:
    every cell of a row is green when the row's Id agrees with all Id*
    columns AND its Value agrees with all Value* columns."""
    id_cols = [c for c in x.columns if c.startswith('Id')]
    value_cols = [c for c in x.columns if c.startswith('Value')]
    # A row qualifies when every Id* column equals Id and every Value*
    # column equals Value (NaN never compares equal, so gaps disqualify).
    ids_align = x[id_cols].eq(x['Id'], axis=0).all(axis=1)
    values_align = x[value_cols].eq(x['Value'], axis=0).all(axis=1)
    row_mask = (ids_align & values_align).to_numpy()
    # Broadcast the per-row boolean across every column; the result keeps
    # the (n_rows, n_cols) shape the Styler expects.
    base = pd.DataFrame('background-color: ', index=x.index, columns=x.columns)
    return np.where(row_mask[:, None], 'background-color: green', base)
Result:
This is my df:
# Nine symbols and their tickers — three consecutive rows per target group.
df = pd.DataFrame({
    'sym': list('abcxyzqwe'),
    'sym_t': 'tsla msft f aapl aa gg amd ba c'.split(),
})
I want to separate this df into groups of three and create a list of dictionaries:
options = [{'value':'a b c', 'label':'tsla msft f'}, {'value':'x y z', 'label':'aapl aa gg'}, {'value':'q w e', 'label':'amd ba c'}]
How can I create that list? My original df has over 1000 rows.
Try groupby to concatenate the rows, then to_dict:
# Bucket consecutive rows into groups of three (integer-divide the positional
# index), space-join each column within a group, then emit one dict per group.
groups = np.arange(len(df)) // 3
tmp = df.groupby(groups).agg(' '.join)
tmp = tmp.set_axis(['value', 'label'], axis=1)
tmp.to_dict(orient='records')
Output:
[{'value': 'a b c', 'label': 'tsla msft f'},
{'value': 'x y z', 'label': 'aapl aa gg'},
{'value': 'q w e', 'label': 'amd ba c'}]
I am trying to combine hundreds of CSVs together in python using the following code:
import os
import pandas as pd
import glob
# Directory holding the CSVs to combine (NOTE: user-specific absolute path).
path = '/Users/parkerbolstad/Downloads/'
all_files = glob.glob(os.path.join(path, "*.csv"))
# Lazily read each CSV — this is a generator expression, so it can be
# consumed only once and cannot be indexed or sliced.
df_from_each_file = (pd.read_csv(f, sep=',') for f in all_files)
# Side-by-side concat keeps EVERY file's columns, including each file's date
# column — which is exactly the repeated-date-column problem described below.
df_merged = pd.concat(df_from_each_file, axis=1, ignore_index=False)
df_merged.to_csv( "merged.csv")
But, this combines all the files together in totality. The first column of each file is dates. I want to pull the dates from the first file and then skip them for the rest.
As of now, I have a new column with dates in it every 4 columns.
Simply run a for-loop to remove this column from every df except the first one — slice with [1:]:
# Drop the date column from every frame except the first, so only one date
# column survives the axis=1 concat.
# NOTE(review): the question builds df_from_each_file as a generator
# expression, which does not support slicing — materialize it first
# (e.g. with list(...)) before using [1:] here.
for df in df_from_each_file[1:]:
    df.drop('date', axis=1, inplace=True)
import pandas as pd
dates = ['2021.08.01', '2021.08.02', '2021.08.03']
df1 = pd.DataFrame({'date': dates, 'value': ['A', 'B', 'C']})
df2 = pd.DataFrame({'date': dates, 'value': ['X', 'Y', 'Z']})
df3 = pd.DataFrame({'date': dates, 'value': ['1', '2', '3']})
df_from_each_file = [df1, df2, df3]
# Keep the date column only in the first frame; drop it everywhere else so
# the side-by-side concat does not repeat it.
for frame in df_from_each_file[1:]:
    frame.drop('date', axis=1, inplace=True)
result = pd.concat(df_from_each_file, axis=1)
print(result)
Result:
date value value value
0 2021.08.01 A X 1
1 2021.08.02 B Y 2
2 2021.08.03 C Z 3
Alternatively, in every df convert the date column into the index, and reset the index afterwards.
This correctly aligns rows even when dates appear in different row positions or some dates are missing.
import pandas as pd
dates = ['2021.08.01', '2021.08.02', '2021.08.03']
df1 = pd.DataFrame({'date': dates, 'value': ['A', 'B', 'C']})
df2 = pd.DataFrame({'date': dates, 'value': ['X', 'Y', 'Z']})
df3 = pd.DataFrame({'date': dates, 'value': ['1', '2', '3']})
df_from_each_file = [df1, df2, df3]
# Promote the date column to the index in every frame so that concat aligns
# rows on dates rather than on positional row numbers.
for frame in df_from_each_file:
    frame.index = frame['date']
    frame.drop('date', axis=1, inplace=True)
result = pd.concat(df_from_each_file, axis=1)
# result = result.sort_index()  # optional: order rows chronologically
result = result.reset_index()
print(result)
How do I calculate the probability of each path occurring in a graph?
For example I have a graph
# Conversation turns: each column_id is one conversation; sequence is the
# utterance, person the speaker, type the conversation's tone.
df = pd.DataFrame({'column_id':[1,1,1,2,2],
'sequence':['hi','how are you','bye','hi','bye'],
'person':['A','B','A','B','A'],
'type':['Friendly','Friendly','Friendly','Mean','Mean']})
# Build edges: each utterance points to the next utterance within the same
# conversation (shift(-1) per column_id group; last turn gets NaN).
df['source'] = df['sequence']
df['target'] = df.groupby('column_id')['sequence'].transform(lambda x: x.shift(-1))
df1 = df[df['target'].notna()]
# NOTE(review): the next line's result is immediately overwritten by the
# reassignment after it — dead code.
df1 = df1[['source','target','person','type']]
df1 = df[df['target'].notna()]
df1 = df1.drop(['sequence'],axis=1)
# Hand-appended edges; values follow df1's column order at this point:
# [column_id, person, type, source, target].
df1.loc[len(df1.index)+1] = [3, 'A', 'Mean', 'hi', 'run away']
df1.loc[len(df1.index)+1] = [3, 'A', 'Mean', 'run away', 'how are you']
df1.loc[len(df1.index)+1] = [3, 'A', 'Mean', 'how are you', 'bye']
df1.loc[len(df1.index)+1] = [4, 'A', 'Friendly', 'hi', 'how are you']
df1.loc[len(df1.index)+1] = [4, 'A', 'Friendly', 'how are you', 'bye']
# NOTE(review): 'runaway' (no space) differs from 'run away' used elsewhere —
# presumably a typo that creates a separate graph node.
df1.loc[len(df1.index)+1] = [5, 'A', 'Friendly', 'hi', 'runaway']
df1.loc[len(df1.index)+1] = [5, 'A', 'Friendly', 'run away', 'how are you']
df1.loc[len(df1.index)+1] = [5, 'A', 'Friendly', 'how are you', 'this is me']
df1.loc[len(df1.index)+1] = [5, 'A', 'Friendly', 'this is me', 'bye']
df1 = df1.reset_index().drop(['index'], axis=1)
# Count how often each (source, target) edge occurs across conversations.
df2 = df1.groupby(['source','target']).size().reset_index()
df2 = df2.drop_duplicates(subset=['source','target'],keep='last')
# Attach the count back onto every edge row; the count column is named 0
# by reset_index and renamed to 'weight' below.
df3 = pd.merge(df1,df2, on=['source','target'], how='left')
df3 = df3.drop('column_id', axis=1)
df3.rename(columns={0:'weight'}, inplace=True)
df3['probability'] = df3['weight']
# NOTE(review): this normalizes each weight by the GLOBAL sum of all weights.
# For per-step transition probabilities (so that multiplying along a path is
# meaningful) you would normalize per source node instead, e.g.
# df3.groupby('source')['weight'].transform('sum') — likely why the multiplied
# path probabilities come out wrong.
df3['probability'] = df3['probability']/df3['probability'].sum()
# Directed graph with person/type/probability carried as edge attributes.
# NOTE(review): duplicate (source, target) rows collapse into one edge in a
# plain DiGraph; the last row's attributes win.
G = nx.from_pandas_edgelist(df3,
source = 'source',
target = 'target',
edge_attr=['person','type','probability'],
create_using=nx.DiGraph())
I want to calculate the probability of each path that is given from
sp = nx.all_simple_paths(G, source='hi', target='bye')
I tried multiplying the probability of each path but the results are not correct.
I have a Pandas DataFrame which contains an ID, Code and Date. For certain codes I would like to fill subsequent appearances of the ID, based on the date, with a determined set of missing codes. I would also like to know the first appearance of the code against the respective ID.
Example as follows, NB: missing codes are A and B (only codes A and B carry over):
import pandas as pd
d = {
    'ID': [1, 2, 1, 2, 3, 1],
    'date': ['2017-03-22', '2017-03-21', '2017-03-23', '2017-03-24', '2017-03-28', '2017-03-28'],
    'Code': ['A, C', 'A', 'B, C', 'E, D', 'A', 'C'],
}
df = pd.DataFrame(d)
# only A and B codes carry over
df
The target dataframe would ideally look as follows:
import pandas as pd
d = {
    'ID': [1, 2, 1, 2, 3, 1],
    'date': ['2017-03-22', '2017-03-21', '2017-03-24', '2017-03-22', '2017-03-28', '2017-03-28'],
    'Code': ['A, C', 'A', 'B, C', 'E, D', 'A', 'C'],
    'Missing_code': ['', '', 'A', 'A', '', 'A, B'],
    'First_code_date': ['', '', '2017-03-22', '2017-03-21', '', '2017-03-23, 2017-03-24'],
}
df = pd.DataFrame(d)
df
Note I am not fussy on how the 'First_code_date' looks providing it is dynamic as the code length may increase or decrease.
If the example is not clear please let me know and I will adjust.
Thank you for help.