I'm trying to subtract a df "stock_returns" (144 rows x 517 cols) by a df "p_df" (144 rows x 1 col).
I have tried:
stock_returns - p_df
stock_returns.rsub(p_df,axis=1)
stock_returns.substract(p_df)
But none of them work; they all return NaN values.
I'm passing it through this function, and using the for loop to get the args:
def disp_calc(returns, p, wi):  # apply(disp_calc, rows=...)
    wi = wi / np.sum(wi)
    rp = (col_len(returns) * (returns - p) ** 2).sum()  # returns - p causing problems
    return np.sqrt(rp)

for i in sectors:
    stock_returns = returns_rolling[sectordict[i]]  # .apply(np.mean, axis=1)
    portfolio_return = returns_rolling[i]; p_df = portfolio_return.to_frame()
    disp_df[i] = stock_returns.apply(disp_calc, args=(portfolio_return, wi))
My expected output is to subtract each of the 517 cols in the first df by the single col in p_df, so the final result would still have 517 cols. Thanks
You're almost there, just need to set axis=0 to subtract along the indexes:
>>> stock_returns = pd.DataFrame([[10,100,200],
[15, 115, 215],
[20,120, 220],
[25,125,225],
[30,130,230]], columns=['A', 'B', 'C'])
>>> stock_returns
A B C
0 10 100 200
1 15 115 215
2 20 120 220
3 25 125 225
4 30 130 230
>>> p_df = pd.DataFrame([1,2,3,4,5], columns=['P'])
>>> p_df
P
0 1
1 2
2 3
3 4
4 5
>>> stock_returns.sub(p_df['P'], axis=0)
A B C
0 9 99 199
1 13 113 213
2 17 117 217
3 21 121 221
4 25 125 225
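Applied to your frames (variable names taken from your loop, so treat them as assumptions), the same pattern is to subtract the single column as a Series so it broadcasts across all 517 columns:
# select the one column of p_df as a Series and subtract row-wise,
# aligning on the shared row index (assumed to match across both frames)
diff = stock_returns.sub(p_df.iloc[:, 0], axis=0)
# or, since portfolio_return is already a Series, skip to_frame() entirely:
diff = stock_returns.sub(portfolio_return, axis=0)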
I want to combine rows in a pandas df with the following logic:
the dataframe is grouped by user
rows are ordered by start_at_min
rows are combined when:
Case A:
if start_at_min <= 200:
row2[start_at_min] - row1[stop_at_min] < 5
(e.g.: 101 - 100 = 1 -> combine; 200 - 100 = 100 -> don't combine)
Case B: if 200 < start_at_min < 400:
change the threshold to 3
Case C: if start_at_min > 400:
never combine
Example df
user start_at_min stop_at_min
0 1 100 150
1 1 152 201 #row0 with row1 combine
2 1 205 260 #row1 with row 2 NO -> start_at_min above 200 -> threshold = 3
3 2 65 100 #no
4 2 200 265 #no
5 2 300 451 #no
6 2 452 460 #no -> start_at_min above 400-> never combine
Expected output:
user start_at_min stop_at_min
0 1 100 201 #row1 with row2 combine
2 1 205 260 #row2 with row 3 NO -> start_at_min above 200 -> threshold = 3
3 2 65 100 #no
4 2 200 265 #no
5 2 300 451 #no
6 2 452 460 #no -> start_at_min above 400-> never combine
I have written the function combine_rows, which takes in 2 Series and applies this logic:
def combine_rows(s1: pd.Series, s2: pd.Series):
    # take 2 rows and combine them if start_at_min row2 - stop_at_min row1 < 5
    if s2['start_at_min'] - s1['stop_at_min'] < 5:
        return pd.Series({
            'user': s1['user'],
            'start_at_min': s1['start_at_min'],
            'stop_at_min': s2['stop_at_min']
        })
    else:
        return pd.concat([s1, s2], axis=1).T
However, I am unable to apply this function to the dataframe.
This was my attempt:
df.groupby('user').sort_values(by=['start_at_min']).apply(combine_rows) # this not working
Here is the full code:
import pandas as pd
import numpy as np
df = pd.DataFrame({
    "user": [1, 1, 2, 2],
    'start_at_min': [60, 101, 65, 200],
    'stop_at_min': [100, 135, 100, 265]
})

def combine_rows(s1: pd.Series, s2: pd.Series):
    # take 2 rows and combine them if start_at_min row2 - stop_at_min row1 < 5
    if s2['start_at_min'] - s1['stop_at_min'] < 5:
        return pd.Series({
            'user': s1['user'],
            'start_at_min': s1['start_at_min'],
            'stop_at_min': s2['stop_at_min']
        })
    else:
        return pd.concat([s1, s2], axis=1).T

df.groupby('user').sort_values(by=['start_at_min']).apply(combine_rows)  # this not working
version 1: one condition
Perform a custom groupby.agg:
threshold = 5
# if the successive stop/start per group are above threshold
# start a new group
group = (df['start_at_min']
.sub(df.groupby('user')['stop_at_min'].shift())
.ge(threshold).cumsum()
)
# groupby.agg
out = (df.groupby(['user', group], as_index=False)
.agg({'start_at_min': 'min',
'stop_at_min': 'max'
})
)
Output:
user start_at_min stop_at_min
0 1 60 135
1 2 65 100
2 2 200 265
Intermediate:
(df['start_at_min']
.sub(df.groupby('user')['stop_at_min'].shift())
)
0 NaN
1 1.0 # below threshold, this will be merged
2 NaN
3 100.0 # above threshold, keep separate
dtype: float64
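For the four-row frame in the full code, the mask and its cumsum then produce these group labels (a quick check of the intermediate step; the labels themselves are arbitrary, only the breaks matter):
print(group.tolist())
# [0, 0, 0, 1] -> rows 0 and 1 merge for user 1; user 2's rows 2 and 3 stay apart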
version 2: multiple conditions
# define variable threshold
threshold = np.where(df['start_at_min'].le(200), 5, 3)
# array([5, 5, 3, 5, 5, 3, 3]) for the example df
# compute the new starts of group like in version 1
# but using the now variable threshold
m1 = (df['start_at_min']
.sub(df.groupby('user')['stop_at_min'].shift())
.ge(threshold)
)
# add a second restart condition (>400)
m2 = df['start_at_min'].gt(400)
# if either mask is True, start a new group
group = (m1|m2).cumsum()
# groupby.agg
out = (df.groupby(['user', group], as_index=False)
.agg({'start_at_min': 'min',
'stop_at_min': 'max'
})
)
Output:
user start_at_min stop_at_min
0 1 100 201
1 1 205 260
2 2 65 100
3 2 200 265
4 2 300 451
5 2 452 460
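Note that this second output corresponds to the seven-row example frame from the question rather than the four-row frame in the full code. A minimal reconstruction for testing (values copied from the example df above):
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'user':         [1, 1, 1, 2, 2, 2, 2],
    'start_at_min': [100, 152, 205, 65, 200, 300, 452],
    'stop_at_min':  [150, 201, 260, 100, 265, 451, 460],
})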
I am struggling to sort a pivot table according to one level of a MultiIndex.
My goal is to sort the values in that level according to a predefined list of values, which basically works.
But I also want to preserve the original order of the other levels.
import pandas as pd
import numpy as np
import random
group_size = 3
n = 10
df = pd.DataFrame({
'i_a': list(np.arange(0, group_size))*n,
'i_b': random.choices(list("ARBMC"), k=n*group_size),
'value': np.random.randint(0, 100, size=n*group_size),
})
pt = pd.pivot_table(
df,
index=['i_a', 'i_b'],
values=['value'],
aggfunc='sum'
)
# The pivot table looks like this
value
i_a i_b
0 A 48
B 55
C 161
M 41
R 126
1 A 60
B 236
C 99
M 30
R 202
2 A 22
B 144
C 30
M 146
R 168
# defined order for i_b
ORDER = {
"A": 0,
"R": 1,
"B": 2,
"M": 3,
"C": 4,
}
def order_by_list(value, ascending=True):
    try:
        idx = ORDER[value]
    except KeyError:
        # place items which are not available at the last place
        idx = len(ORDER)
    if not ascending:
        # reverse the order
        idx = -idx
    return idx

def sort_by_ib(df):
    return df.sort_index(level=["i_b"],
                         key=lambda index: index.map(order_by_list),
                         sort_remaining=False)
pt_sorted = pt.pipe(sort_by_ib)
# the i_a index of pt_sorted is rearranged, which I don't want
value
i_a i_b
0 A 48
1 A 60
2 A 22
0 R 126
1 R 202
2 R 168
0 B 55
1 B 236
2 B 144
0 M 41
1 M 30
2 M 146
0 C 161
1 C 99
2 C 30
# Instead, the sorted pivot table should look like this
value
i_a i_b
0 A 48
R 126
B 55
M 41
C 161
1 A 60
R 202
B 236
M 30
C 99
2 A 22
R 168
B 144
M 146
C 30
What is the preferred/recommended way to do this?
If you want to change the order, you can create a helper column for the mapping, add it to the index parameter in pivot_table, and finally remove it with droplevel. If it is added before i_b, the table is sorted by i_a and then by the new level:
df['new'] = df['i_b'].map(ORDER)
pt = pd.pivot_table(
df,
index=[ 'i_a','new', 'i_b'],
values=['value'],
aggfunc='sum'
).droplevel(1)
print (pt)
value
i_a i_b
0 A 217
R 135
M 150
C 43
1 A 44
R 266
B 44
M 13
C 128
2 A 167
R 3
B 85
M 159
C 81
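Alternatively, if you prefer not to modify df, here is a sketch applying sort_index with a per-level key to the original pt from the question (this assumes pandas >= 1.1, where the key is applied to each index level separately; verify on your version):
pt_sorted = pt.sort_index(
    level=['i_a', 'i_b'],
    key=lambda idx: idx.map(ORDER) if idx.name == 'i_b' else idx,
    sort_remaining=False,
)
Because i_a comes first in level and is already in ascending order in this frame, its order is kept and only i_b is reordered by the ORDER mapping.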
I am aware of this link but I didn't manage to solve my problem.
I have this below DataFrame from pandas.DataFrame.groupby().sum():
Value
Level Company Item
1 X a 100
b 200
Y a 35
b 150
c 35
2 X a 48
b 100
c 50
Y a 80
and would like to add total rows for each index level I have, to get:
Value
Level Company Item
1 X a 100
b 200
Total 300
Y a 35
b 150
c 35
Total 220
Total 520
2 X a 48
b 100
c 50
Total 198
Y a 80
Total 80
Total 278
Total 798
As requested, here is code to reproduce the DataFrame:
import pandas as pd

level = list(map(int, list('111112222')))
company = list('XXYYYXXXY')
item = list('ababcabca')
value = [100,200,35,150,35,48,100,50,80]
col = ['Level', 'Company', 'Item', 'Value']
df = pd.DataFrame([level,company,item,value]).T
df.columns = col
df.groupby(['Level', 'Company', 'Item']).sum()
You can use:
m=df.groupby(['Level','Company','Item'])['Value'].sum().unstack()
m.assign(total=m.sum(1)).stack().to_frame('Value')
Value
Level Company Item
1 X a 100.0
b 200.0
total 300.0
Y a 35.0
b 150.0
c 35.0
total 220.0
2 X a 48.0
b 100.0
c 50.0
total 198.0
Y a 80.0
total 80.0
Try this. Basically, it creates two new dfs using the sums of the two groupings and concatenates the three data frames:
level = list(map(int, list('111112222')))
company = list('XXYYYXXXY')
item = list('ababcabca')
value = [100,200,35,150,35,48,100,50,80]
col = ['Level', 'Company', 'Item', 'Value']
df = pd.DataFrame([level,company,item,value]).T
df.columns = col
df1 = (df.groupby(['Level', 'Company', 'Item'])['Value'].sum())
df2 = (df1.groupby(level=0).sum().to_frame().assign(Company='total').set_index('Company', append=True))
df3 = (df1.groupby(['Level','Company']).sum().to_frame().assign(Item='total').set_index('Item', append=True))
dfx = pd.concat([df1.to_frame().reset_index(),
df2.reset_index(),
df3.reset_index()],sort=False)
print(dfx)
Output:
Level Company Item Value
0 1 X a 100
1 1 X b 200
2 1 Y a 35
3 1 Y b 150
4 1 Y c 35
5 2 X a 48
6 2 X b 100
7 2 X c 50
8 2 Y a 80
0 1 total NaN 520
1 2 total NaN 278
0 1 X total 300
1 1 Y total 220
2 2 X total 198
3 2 Y total 80
This is not sorted as you expect, though.
If I concat the 3 dfs without resetting the index, I get the expected sort order, but the index collapses to plain tuples instead of a MultiIndex:
dfx = pd.concat([df1.to_frame(), df2, df3]).sort_index()
Output
Value
(1, X, a) 100
(1, X, b) 200
(1, X, total) 300
(1, Y, a) 35
(1, Y, b) 150
(1, Y, c) 35
(1, Y, total) 220
(1, total) 520
(2, X, a) 48
(2, X, b) 100
(2, X, c) 50
(2, X, total) 198
(2, Y, a) 80
(2, Y, total) 80
(2, total) 278
I am not sure how to convert this back into proper index columns as in your df.
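One sketch for getting back to a three-level MultiIndex is to pad the shorter level-total keys with an empty Item so every key has three parts (level names assumed from your frame):
dfx.index = pd.MultiIndex.from_tuples(
    [t + ('',) * (3 - len(t)) for t in dfx.index],
    names=['Level', 'Company', 'Item'],
)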
You can try stacking it one level at a time:
m = df.groupby(['Level','Company','Item'])['Value'].sum().unstack(level=['Company','Item'])
m = m.assign(total=m.sum(1))
m = m.stack(level='Company')
m = m.assign(total=m.sum(1))
m = m.stack(level='Item')
The output has duplicate totals though:
Level Company Item
1 X a 100.0
b 200.0
total 300.0
Y a 35.0
b 150.0
c 35.0
total 220.0
total 520.0
total 520.0
2 X a 48.0
b 100.0
c 50.0
total 198.0
Y a 80.0
total 80.0
total 278.0
total 278.0
dtype: float64
I have used some logic to iterate over my tsv file in such a way that every text group is given a group number. You can find this code in the answers; I have changed the question I had earlier, and with the help of @Jeril I was able to get this code.
import pandas as pd
import numpy as np

main_df = pd.read_csv('sampleOutput.tsv', delimiter='\t')
main_df = main_df.dropna(subset=['text'], axis=0)

final_df = pd.DataFrame()
for page_no in main_df['page_num'].unique():
    df = main_df[main_df['page_num'] == page_no].copy(deep=True)
    df['top'] = df['top'].astype(int)
    df['bool'] = (df['top'] - df['top'].shift(-1)) < -50
    df.loc[df['bool'] == True, 'group'] = range(
        1, (df['bool'] == True).sum() + 1)
    df['group'] = df['group'].replace({0: np.nan}).bfill()
    df['group'] = df['group'].fillna((df['bool'] == True).sum() + 1)
    final_df = pd.concat([final_df, df])
print(final_df)
So this is my table from which I have to get the output. The logic I have used assumes thresholds of (300, 50); every text group will have a different groupNo. I am doing this so that I can display my output in a more sensible way.
Sentence grouping:
a) Words on the same line are grouped if the x distance < threshold
b) Words on the next line are grouped with the previous line if the y distance < threshold
x = current_left - previous_left (left refers to the "left" column)
y = current_line_top - previous_line_top (top refers to the "top" column)
INPUT / EXPECTED OUTPUT: (not fully reproduced here; a sample with the expected groupNo column is shown below)
This is an example, as writing the whole input again is difficult. Basically, every logical sentence will be in the same group; the logic is constant:
x = current line on 'left' column - previous line on 'left' column (left refers to the "left" column)
x < Threshold(300)
y = current_line_top_column - previous_line_top_column (top refers to the "top" column)
y < Threshold(50)
I am not able to implement this logic; can anyone help?
page_num block_num line_num word_num left top width text groupNo
1 27 1 1 405 420 129 Property 1
1 27 1 2 543 420 31 of 1
1 27 1 3 578 420 159 Accenture 1
1 27 4 1 409 581 105 INTERPRET 2
1 27 4 2 520 581 90 DRAWING 2
1 27 4 3 616 581 38 PER 2
1 27 4 4 659 581 113 APPLICABLE 2
1 27 4 5 779 581 267 STANDARD: 2
1 27 5 1 411 603 114 Accenture 2
1 27 5 2 532 603 84 ACCOO1 2
2 46 1 1 480 800 114 yoyoyo 3
2 46 1 2 550 800 84 heloo 3
Please ask if the question is not clear.
Can you try the following:
main_df = pd.read_csv('codebeautify.tsv', delimiter='\t')

final_df = pd.DataFrame()
for page_no in main_df['page_num'].unique():
    df = main_df[main_df['page_num'] == page_no].copy(deep=True)
    df['top'] = df['top'].astype(int)
    df['bool'] = ((df['top'] - df['top'].shift(-1)).abs() >
                  50) != ((df['left'] - df['left'].shift(-1)).abs() > 350)
    df.loc[df['bool'] == True, 'group'] = range(
        1, (df['bool'] == True).sum() + 1)
    final_df = pd.concat([final_df, df])

final_df['group'] = final_df['group'].replace({0: np.nan}).bfill()
final_df['group'] = final_df['group'].fillna((final_df['bool'] == True).sum() + 1)
Sample Output:
left top text bool group
0 405 420 Property False 1.0
1 543 420 of False 1.0
2 578 420 Accenture True 1.0
3 409 581 INTERPRET False 2.0
4 520 581 DRAWING False 2.0
5 616 581 PER False 2.0
6 659 581 APPLICABLE False 2.0
7 779 581 STANDARD True 2.0
8 411 603 Accenture False 3.0
9 532 603 ACCOO1 True 3.0
10 480 800 yoyoyo False 4.0
11 550 800 heloo False 4.0
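Once the grouping looks right, here is a small sketch for collapsing each group back into one line of text (column names as in the frames above; it assumes the text column holds strings):
sentences = (final_df
             .groupby(['page_num', 'group'])['text']
             .apply(' '.join)
             .reset_index(name='sentence'))
print(sentences)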
I have a multi-indexed dataframe and I wish to extract a subset based on index values and on a boolean criterion. I want to overwrite the values of a specific column with new values, using multi-index keys and boolean indexers to select the records to modify.
import pandas as pd
import numpy as np
years = [1994,1995,1996]
householdIDs = [ id for id in range(1,100) ]
midx = pd.MultiIndex.from_product( [years, householdIDs], names = ['Year', 'HouseholdID'] )
householdIncomes = np.random.randint( 10000,100000, size = len(years)*len(householdIDs) )
householdSize = np.random.randint( 1,5, size = len(years)*len(householdIDs) )
df = pd.DataFrame( {'HouseholdIncome':householdIncomes, 'HouseholdSize':householdSize}, index = midx )
df.sort_index(inplace = True)
Here's what the sample data looks like...
df.head()
=> HouseholdIncome HouseholdSize
Year HouseholdID
1994 1 23866 3
2 57956 3
3 21644 3
4 71912 4
5 83663 3
I'm able to successfully query the dataframe using the indices and column labels.
This example gives me the HouseholdSize for household 3 in year 1996
df.loc[ (1996,3 ) , 'HouseholdSize' ]
=> 1
However, I'm unable to combine boolean selection with multi-index queries...
The pandas docs on multi-indexing say there is a way to combine boolean indexing with multi-indexing and give an example...
In [52]: idx = pd.IndexSlice
In [56]: mask = dfmi[('a','foo')]>200
In [57]: dfmi.loc[idx[mask,:,['C1','C3']],idx[:,'foo']]
Out[57]:
lvl0 a b
lvl1 foo foo
A3 B0 C1 D1 204 206
C3 D0 216 218
D1 220 222
B1 C1 D0 232 234
D1 236 238
C3 D0 248 250
D1 252 254
...which I can't seem to replicate on my dataframe
idx = pd.IndexSlice
housholdSizeAbove2 = ( df.HouseholdSize > 2 )
df.loc[ idx[ housholdSizeAbove2, 1996, :] , 'HouseholdSize' ]
Traceback (most recent call last):
File "python", line 1, in <module>
KeyError: 'MultiIndex Slicing requires the index to be fully lexsorted tuple len (3), lexsort depth (2)'
In this example I would want to see all the households in 1996 with HouseholdSize above 2.
DataFrame.query() should work in this case:
df.query("Year == 1996 and HouseholdID > 2")
Demo:
In [326]: with pd.option_context('display.max_rows',20):
...: print(df.query("Year == 1996 and HouseholdID > 2"))
...:
HouseholdIncome HouseholdSize
Year HouseholdID
1996 3 28664 4
4 11057 1
5 36321 2
6 89469 4
7 35711 2
8 85741 1
9 34758 3
10 56085 2
11 32275 4
12 77096 4
... ... ...
90 40276 4
91 10594 2
92 61080 4
93 65334 2
94 21477 4
95 83112 4
96 25627 2
97 24830 4
98 85693 1
99 84653 4
[97 rows x 2 columns]
UPDATE:
Is there a way to select a specific column?
In [333]: df.loc[df.eval("Year == 1996 and HouseholdID > 2"), 'HouseholdIncome']
Out[333]:
Year HouseholdID
1996 3 28664
4 11057
5 36321
6 89469
7 35711
8 85741
9 34758
10 56085
11 32275
12 77096
...
90 40276
91 10594
92 61080
93 65334
94 21477
95 83112
96 25627
97 24830
98 85693
99 84653
Name: HouseholdIncome, dtype: int32
and ultimately I want to overwrite the data on the dataframe.
In [331]: df.loc[df.eval("Year == 1996 and HouseholdID > 2"), 'HouseholdSize'] *= 10
In [332]: df.loc[df.eval("Year == 1996 and HouseholdID > 2")]
Out[332]:
HouseholdIncome HouseholdSize
Year HouseholdID
1996 3 28664 40
4 11057 10
5 36321 20
6 89469 40
7 35711 20
8 85741 10
9 34758 30
10 56085 20
11 32275 40
12 77096 40
... ... ...
90 40276 40
91 10594 20
92 61080 40
93 65334 20
94 21477 40
95 83112 40
96 25627 20
97 24830 40
98 85693 10
99 84653 40
[97 rows x 2 columns]
UPDATE2:
I want to pass a variable year instead of a specific value. Is there
a cleaner way to do it than "Year == " + str(year) + " and HouseholdID > " + str(householdSize)?
In [5]: year = 1996
In [6]: household_ids = [1, 2, 98, 99]
In [7]: df.loc[df.eval("Year == @year and HouseholdID in @household_ids")]
Out[7]:
HouseholdIncome HouseholdSize
Year HouseholdID
1996 1 42217 1
2 66009 3
98 33121 4
99 45489 3
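If you prefer to stay with plain boolean indexing instead of building eval/query strings, a sketch that combines the column mask with an index-level test should behave the same way (here using HouseholdSize > 2, matching the condition stated in the question):
year = 1996
mask = (df['HouseholdSize'] > 2) & (df.index.get_level_values('Year') == year)
df.loc[mask, 'HouseholdSize'] *= 10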