Related
Say I have two DataFrames
df1 = pd.DataFrame({'A':[1,2], 'B':[3,4]}, index = [0,1])
df2 = pd.DataFrame({'B':[8,9], 'C':[10,11]}, index = [1,2])
I want to merge so that any values in df1 are overwritten in there is a value in df2 at that location and any new values in df2 are added including the new rows and columns.
The result should be:
A B C
0 1 3 nan
1 2 8 10
2 nan 9 11
I've tried combine_first but that causes only nan values to be overwritten
updated has the issue where new rows are created rather than overwritten
merge has many issues.
I've tried writing my own function
def take_right(df1, df2, j, i):
print (df1)
print (df2)
try:
s1 = df1[j][i]
except:
s1 = np.NaN
try:
s2 = df2[j][i]
except:
s2 = np.NaN
if math.isnan(s2):
#print(s1)
return s1
else:
# print(s2)
return s2
def combine_df(df1, df2):
rows = (set(df1.index.values.tolist()) | set(df2.index.values.tolist()))
#print(rows)
columns = (set(df1.columns.values.tolist()) | set(df2.columns.values.tolist()))
#print(columns)
df = pd.DataFrame()
#df.columns = columns
for i in rows:
#df[:][i]=[]
for j in columns:
df = df.insert(int(i), j, take_right(df1,df2,j,i), allow_duplicates=False)
# print(df)
return df
This won't add new columns or rows to an empty DataFrame.
Thank you!!
One approach is to create an empty output dataframe with the union of columns and indices from df1 and df2 and then use the df.update method to assign their values into the out_df
import pandas as pd
df1 = pd.DataFrame({'A':[1,2], 'B':[3,4]}, index = [0,1])
df2 = pd.DataFrame({'B':[8,9], 'C':[10,11]}, index = [1,2])
out_df = pd.DataFrame(
columns = df1.columns.union(df2.columns),
index = df1.index.union(df2.index),
)
out_df.update(df1)
out_df.update(df2)
out_df
Why does combine_first not work?
df = df2.combine_first(df1)
print(df)
Output:
A B C
0 1.0 3 NaN
1 2.0 8 10.0
2 NaN 9 11.0
I would like to add a new multiindex in between the already existing indexes 'Warnings' and 'equip' with the sum of the column 'count per equip' for each 'Warnings' level.
idx = pd.MultiIndex.from_product([['warning1', 'warning2', 'warning3'],
['ff0001', 'ff0002', 'ff0003']],
names=['Warnings', 'equip'])
col = ['count per equip']
df = pd.DataFrame([100,2,1,44,45,20,25,98,0], idx, col)
df
So the resulting dataframe would have the same number of index in level 0, 'Warnings', and for this example it would be [103, 109, 123], respectively.
I've managed to sum and insert the index at the right place, but when trying to do all together, all values are NaN's:
df = df.assign(total=df.groupby(level=[0]).size()).set_index('total', append=True).reorder_levels(['Warnings','total','equip'])
In assign we can't do groupby. So, the following code create similar data.
idx = pd.MultiIndex.from_product([['warning1', 'warning2', 'warning3'],
['ff0001', 'ff0002', 'ff0003']],
names=['Warnings', 'equip'])
col = ['count per equip']
df = pd.DataFrame([100,2,1,44,45,20,25,98,0], idx, col)
Grouping based on level = 0
df['total'] = df.groupby(level=0).transform(lambda x: x.size)
df = df.set_index('total', append=True).reorder_levels(['Warnings','total','equip'])
print(df)
count per equip
Warnings total equip
warning1 3 ff0001 100
ff0002 2
ff0003 1
warning2 3 ff0001 44
ff0002 45
ff0003 20
warning3 3 ff0001 25
ff0002 98
ff0003 0
I have to dataframes and I am using pandas.
I want to do a cumulative sum from a variable date and by the value in a column
I want to add a second column to df2 that show the date to know the day when the sum of the AVG column is greater than 100 after date2 in df2.
For example with df1 and df2 being the dataframe I start with and df3 what I want and df3['date100'] is the day the sum of avg is greater than 100:
df1 = pd.DataFrame({'date1': ['1/1/2014', '2/1/2014', '3/1/2014','1/1/2014', '2/1/2014', '3/1/2014','1/1/2014', '2/1/2014', '3/1/2014'],
'Place':['A','A','A','B','B','B','C','C','C'],'AVG': [62,14,47,25,74,60,78,27,41]})
df2 = pd.DataFrame({'date2': ['1/1/2014', '2/1/2014'], 'Place':['A','C'])})
*Something*
df3 = pd.DataFrame({'date2': ['1/1/2014', '2/1/2014'], 'Place':['A','C'], 'date100': ['3/1/2014', '2/1/2014'], 'sum': [123, 105]})
I found some answers but most them use groupby and df2 has no groups.
Since your example is very basic, if you have edge cases you want me to take care of, just ask. This solution implies that :
The solution :
# For this solution your DataFrame needs to be sorted by date.
limit = 100
df = pd.DataFrame({
'date1': ['1/1/2014', '2/1/2014', '3/1/2014','1/1/2014',
'2/1/2014', '3/1/2014','1/1/2014', '2/1/2014', '3/1/2014'],
'Place':['A','A','A','B','B','B','C','C','C'],
'AVG': [62,14,47,25,74,60,78,27,41]})
df2 = pd.DataFrame({'date2': ['1/1/2014', '2/1/2014'], 'Place':['A','C']})
result = []
for row in df2.to_dict('records'):
# For each date, I want to select the date that comes AFTER this one.
# Then, I take the .cumsum(), because it's the agg you wish to do.
# Filter by your limit and take the first occurrence.
# Converting this to a dict, appending it to a list, makes it easy
# to rebuild a DataFrame later.
ndf = df.loc[ (df['date1'] >= row['date2']) & (df['Place'] == row['Place']) ]\
.sort_values(by='date1')
ndf['avgsum'] = ndf['AVG'].cumsum()
final_df = ndf.loc[ ndf['avgsum'] >= limit ]
# Error handling, in case there is not avgsum above the threshold.
try:
final_df = final_df.iloc[0][['date1', 'avgsum']].rename({'date1' : 'date100'})
result.append( final_df.to_dict() )
except IndexError:
continue
df3 = pd.DataFrame(result)
final_df = pd.concat([df2, df3], axis=1, sort=False)
print(final_df)
# date2 Place avgsum date100
# 0 1/1/2014 A 123.0 3/1/2014
# 1 2/1/2014 C NaN NaN
Here is a direct solution, with following assumptions:
df1 is sorted by date
one solution exists for every date in df2
You can then do:
df2 = df2.join(pd.concat([
pd.DataFrame(pd.DataFrame(df1.loc[df1.date1 >= d].AVG.cumsum()).query('AVG>=100')
.iloc[0]).transpose()
for d in df2.date2]).rename_axis('ix').reset_index())\
.join(df1.drop(columns='AVG'), on='ix').rename(columns={'AVG': 'sum', 'date1': 'date100'})\
.drop(columns='ix')[['date2', 'date100', 'sum']]
This does the following:
for each date in df2 find the first date when the cumul on AVG will be at least 100
combine the results in one single dataframe indexed by the index of that line in df1
store that index in an ix column and reset the index to join that dataframe to df2
join that to df1 minus the AVG column using the ix column
rename the columns, remove the ix column, and re-order everything
I have a dataframe such as:
label column1
a 1
a 2
b 6
b 4
I would like to make a dataframe with a new column, with the opposite value from column1 where the labels match. Such as:
label column1 column2
a 1 2
a 2 1
b 6 4
b 4 6
I know this is probably very simple to do with a groupby command but I've been searching and can't find anything.
The following uses groupby and apply and seems to work okay:
x = pd.DataFrame({ 'label': ['a','a','b','b'],
'column1': [1,2,6,4] })
y = x.groupby('label').apply(
lambda g: g.assign(column2 = np.asarray(g.column1[::-1])))
y = y.reset_index(drop=True) # optional: drop weird index
print(y)
you can try the code block below:
#create the Dataframe
df = pd.DataFrame({'label':['a','a','b','b'],
'column1':[1,2,6,4]})
#Group by label
a = df.groupby('label').first().reset_index()
b = df.groupby('label').last().reset_index()
#Concat those groups to create columns2
df2 = (pd.concat([b,a])
.sort_values(by='label')
.rename(columns={'column1':'column2'})
.reset_index()
.drop('index',axis=1))
#Merge with the original Dataframe
df = df.merge(df2,left_index=True,right_index=True,on='label')[['label','column1','column2']]
Hope this helps
Assuming their are only pairs of labels, you could use the following as well:
# Create dataframe
df = pd.DataFrame(data = {'label' :['a', 'a', 'b', 'b'],
'column1' :[1,2, 6,4]})
# iterate over dataframe, identify matching label and opposite value
for index, row in df.iterrows():
newvalue = int(df[(df.label == row.label) & (df.column1 != row.column1)].column1.values[0])
# set value to new column
df.set_value(index, 'column2', newvalue)
df.head()
You can use groupby with apply where create new Series with back order:
df['column2'] = df.groupby('label')["column1"] \
.apply(lambda x: pd.Series(x[::-1].values)).reset_index(drop=True)
print (df)
column1 label column2
0 1 a 2
1 2 a 1
2 6 b 4
3 4 b 6
Lets say one has a DataFrame df1 with INDEX, Column1, Column2 and another df2 with INDEX, Column1, Column3.
Both INDEX have similar values so I want to use that to merge the information of one table on the other.
I have been told to do as follows by other users:
df1.update(df2, join='left', overwrite=True)
This works if both INDEXES have similar values. The result will be df1 will now have INDEX, Column1 (from df2) and Column2 (original from df1). Column3 is not added to df1 (this behaviour is wanted vs. the "merge" command that adds everything).
Now, I would like to update df1 only on a few cases and based on Column2. I thought this would work:
df1[df1['Column2'] == 'Cond'].update(df2, join='left', overwrite=True)
But it doesn't; sometimes I get an error, other the command works but ALL df1 values have been modified.
Any idea on how to do this?
PS: Using .loc won't work as that requires that whatever INDEX you search for exists and this is not the case.
EDIT: Additional example
In [37]: df1 = pd.DataFrame([['USA',1],['USA',2],['USA',3],['FRA',1],['FRA',2]], columns = ['country', 'value'])
In [38]: df2 = pd.DataFrame([['USA',10],['FRA',20]], columns = ['country', 'value'])
In [39]: df1 = df1.set_index('country')
In [40]: df2 = df2.set_index('country')
In [41]: mask = df1['value'] >= 2
In [42]: idx = df1.index[mask]
In [43]: idx = idx.unique()
In [44]: df1
Out[44]:
value
country
USA 1
USA 2
USA 3
FRA 1
FRA 2
In [45]: df2
Out[45]:
value
country
USA 10
FRA 20
In [46]: idx
Out[46]: array(['USA', 'FRA'], dtype=object)
In [47]: df1.update(df2.loc[idx])
In [48]: df1
Out[48]:
value
country
USA 10
USA 10
USA 10
FRA 20
FRA 20
Define the boolean mask
mask = (df1['Column2'] == 'Cond')
If df1.index is identical to df2.index, then mask can be used to select
rows from df2 -- i.e., df2.loc[mask]. But if they are not identical, then
df2.loc[mask] may raise an error (if len(df1) != len(df2)), or worse, silently select the wrong rows
because the boolean mask is not aligning index values between df1 and df2.
So in the more general case when the indexes are not identical, the trick is to
convert the boolean mask into an Index that can be used to restrict
df2.
If df1.index is unique then call df1.update on the restricted df2:
idx = df1.index[mask]
df1.update(df2.loc[idx])
For example,
import pandas as pd
df1 = pd.DataFrame({'Column1':[1,2,3], 'Column2':['Cond',5,'Cond']}, index=['A','B','C'])
# Column1 Column2
# A 1 Cond
# B 2 5
# C 3 Cond
df2 = pd.DataFrame({'Column1':[10,20,30], 'Column3':[40,50,60]}, index=['D','B','C'])
# Column1 Column3
# D 10 40
# B 20 50
# C 30 60
mask = df1['Column2'] == 'Cond'
idx = df1.index[mask]
df1.update(df2.loc[idx])
print(df1)
prints
Column1 Column2
A 1 Cond
B 2 5
C 30 Cond
If df1.index is not unique, then make the index unique by adding mask to it:
df1['mask'] = df1['value'] >= 2
df2['mask'] = True
df1 = df1.set_index('mask', append=True)
df2 = df2.set_index('mask', append=True)
Then calling df1.update(df2) produces the desired result because update aligns indices.
For example,
import pandas as pd
df1 = pd.DataFrame([['USA',1],['USA',2],['USA',3],['FRA',1],['FRA',2]],
columns = ['country', 'value'])
df2 = pd.DataFrame([['USA',10],['FRA',20]], columns = ['country', 'value'])
df1 = df1.set_index('country')
# value
# country
# USA 1
# USA 2
# USA 3
# FRA 1
# FRA 2
df2 = df2.set_index('country')
# value
# country
# USA 10
# FRA 20
df1['mask'] = df1['value'] >= 2
df2['mask'] = True
df1 = df1.set_index('mask', append=True)
# value
# country mask
# USA False 1
# True 2
# True 3
# FRA False 1
# True 2
df2 = df2.set_index('mask', append=True)
# value
# country mask
# USA True 10
# FRA True 20
df1.update(df2)
df1.index = df1.index.droplevel('mask')
print(df1)
yields
value
country
USA 1
USA 10
USA 10
FRA 1
FRA 20