I'm creating a dataframe the following way:
filtered_data.groupby('weekday').agg({'airing': np.sum, 'uplift': [np.sum,np.mean]})
Which creates a table of:
        sum  sum      mean
weekday
1        11   20  1.818182
2        24   46  1.916667
...
What I want is to include a final row which is the total for each column.
Thanks in advance!
You can use the .loc indexer in order to achieve that:
df.loc[len(df)] = [df[col].sum() for col in df.columns]
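Applied to the grouped frame from the question, a minimal sketch (the sample values below are made up to stand in for filtered_data) could label the extra row explicitly:

import pandas as pd

# made-up stand-in for filtered_data with the question's columns
filtered_data = pd.DataFrame({'weekday': [1, 1, 2, 2],
                              'airing':  [5, 6, 10, 14],
                              'uplift':  [9, 11, 20, 26]})

agg_df = filtered_data.groupby('weekday').agg({'airing': 'sum', 'uplift': ['sum', 'mean']})

# enlarge with .loc, labelling the new row 'Total' instead of using a positional index
agg_df.loc['Total'] = [agg_df[col].sum() for col in agg_df.columns]
print(agg_df)

Note this sums every column, including the mean column, exactly like the one-liner above; swap in whatever aggregate you want for that entry.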
In this case, you should probably create a Series that keeps track of your summary stats. You could then concat if you needed to for display purposes.
summary = pd.Series([filtered_data.airing.sum(),
                     filtered_data.uplift.sum(),
                     filtered_data.uplift.mean()],
                    name='summary')
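A minimal sketch of that concat step for display (the sample data is made up, and giving the summary the aggregated frame's columns as its index is my own adjustment so the rows line up):

import pandas as pd

# made-up stand-in for filtered_data with the question's columns
filtered_data = pd.DataFrame({'weekday': [1, 1, 2, 2],
                              'airing':  [5, 6, 10, 14],
                              'uplift':  [9, 11, 20, 26]})

agg_df = filtered_data.groupby('weekday').agg({'airing': 'sum', 'uplift': ['sum', 'mean']})

summary = pd.Series([filtered_data.airing.sum(),
                     filtered_data.uplift.sum(),
                     filtered_data.uplift.mean()],
                    index=agg_df.columns,   # align with the aggregated columns
                    name='summary')

# stack the summary under the per-weekday rows purely for display
print(pd.concat([agg_df, summary.to_frame().T]))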
For this purpose I created an aggregation tool which behaves like GROUPING SETS in SQL. Supply columns by which to group and an aggregate function, and get back an aggregated DataFrame.
import itertools as it
import pandas as pd
from pandas.util.testing import assert_frame_equal

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return it.chain.from_iterable(it.combinations(s, r) for r in range(len(s)+1))

def grouper(df, grpby, aggfunc):
    ''' produces aggregate DataFrame from DataFrames for non-redundant groupings
        `workingdf` is used to avoid modifying original DataFrame
    '''
    uniqcols = set(col for col in grpby if len(df[col].unique()) == 1)
    subset = set()
    for col in uniqcols:
        for grp in powerset(grpby):
            if col in grp:
                subset.add(grp)  # add level of aggregation only when non-redundant
    if len(subset) == 0:
        for grp in powerset(grpby):
            subset.add(grp)
    workingdf = df.copy()
    for idx, i in enumerate(subset):
        if i != ():
            tmp = aggfunc(workingdf.groupby(i))
        else:
            # hack to get output to be a DataFrameGroupBy object:
            # insert dummy column on which to group by
            dummycolname = hash(tuple(workingdf.columns.tolist()))
            workingdf[dummycolname] = ''
            tmp = aggfunc(workingdf.groupby(dummycolname))
        # drop the index and add it back
        if i == ():
            tmp.reset_index(drop=True, inplace=True)
        else:
            tmp.reset_index(inplace=True)
        for j in grpby:
            if j not in tmp:  # if column is not in DataFrame add it
                tmp[j] = '(All)'
        # new list with all columns including aggregate ones; do this only once
        if idx == 0:
            finalcols = grpby[:]
            addlcols = [k for k in tmp if k not in grpby]  # aggregate columns
            finalcols.extend(addlcols)
        # reorder columns
        tmp = tmp[finalcols]
        if idx == 0:
            final = tmp; del tmp
        else:
            final = pd.concat([final, tmp]); del tmp
    del workingdf
    final.sort_values(finalcols, inplace=True)
    final.reset_index(drop=True, inplace=True)
    return final

def agg(grpbyobj):
    ''' the purpose of this function is to:
        specify aggregate operation(s) you wish to perform,
        name the resulting column(s) in the final DataFrame.
    '''
    tmp = pd.DataFrame()
    tmp['Total (n)'] = grpbyobj['Total'].sum()
    return tmp

if __name__ == '__main__':
    df = pd.DataFrame({'Area': ['a', 'a', 'b',],
                       'Year': [2014, 2014, 2014,],
                       'Month': [1, 2, 3,],
                       'Total': [4, 5, 6,],})
    final = grouper(df, grpby=['Area', 'Year'], aggfunc=agg)
    # test against expected result
    expected = pd.DataFrame({u'Year': {0: 2014, 1: 2014, 2: 2014},
                             u'Total (n)': {0: 15, 1: 9, 2: 6},
                             u'Area': {0: u'(All)', 1: u'a', 2: u'b'}})
    expected = expected[final.columns.tolist()]
    try:
        # check_names kwarg True: compare indexes and columns
        assert_frame_equal(final, expected, check_names=True)
    except AssertionError as e:
        raise
I have a data frame as follows
I/P
date,low,high,close
d1,l1,h1,c1
d2,l2,h2,c2
d3,l3,h3,c3
d4,l4,h4,c4
d5,l5,h5,c5
d6,l6,h6,c6
d7,l7,h7,c7
O/P
d1,l1,h1,c1,d2,l2,h2,c2,d3,l3,h3,c3
d2,l2,h2,c2,d3,l3,h3,c3,d4,l4,h4,c4
d3,l3,h3,c3,d4,l4,h4,c4,d5,l5,h5,c5
d4,l4,h4,c4,d5,l5,h5,c5,d6,l6,h6,c6
....
Basically, join all rows, split them into subarrays of size 3 starting at each index, and create the output data frame.
The following code works, but it's too verbose and slow. Does pandas have something built in for this?
def flatten(df):
    candles = []
    i = 0
    while i < len(df):
        candles.append(df.iloc[i])
        i = i + 1
    return candles

def slide_and_expand(candles, k):
    return [candles[i:i+k] for i in range(len(candles) - k + 1)]

def candle_to_dict(col_name_prefix, candle_series):
    candle_dict = {}
    for index, val in candle_series.iteritems():
        col_name = col_name_prefix + index
        candle_dict[col_name] = val
    return candle_dict

def candle_group_to_feature_vector(candle_group):
    feature_vector_dict = {}
    i = 0
    for candle in candle_group:
        col_name_prefix = f"c{i}_"
        candle_dict = candle_to_dict(col_name_prefix, candle)
        feature_vector_dict.update(candle_dict)
        i = i + 1
    return feature_vector_dict

def candle_groups_to_feature_vectors(candle_groups):
    feature_vectors = []
    for candle_group in candle_groups:
        feature_vector = candle_group_to_feature_vector(candle_group)
        feature_vectors.append(feature_vector)
    return feature_vectors

fv_len = 3
candles = flatten(data)
candle_groups = slide_and_expand(candles, fv_len)
feature_vectors = candle_groups_to_feature_vectors(candle_groups)
data_fv = pd.DataFrame.from_dict(feature_vectors, orient='columns')
data_fv
You could do something like this:
n = len(df.index)  # number of rows in original dataframe 'df'
df_0 = df.loc[0:n-3].reset_index(drop=True)
df_1 = df.loc[1:n-2].reset_index(drop=True)
df_2 = df.loc[2:n-1].reset_index(drop=True)
# reset_index is needed so concat stacks the shifted copies side by side
# instead of re-aligning them on their original row labels
df_final = pd.concat([df_0, df_1, df_2], axis=1)
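The same idea generalized to any window size k, as a sketch (the helper name sliding_rows is made up; it assumes a default RangeIndex and suffixes the column names so the copies stay distinguishable):

import pandas as pd

def sliding_rows(df, k):
    # one output row per window of k consecutive input rows
    parts = []
    for offset in range(k):
        part = df.iloc[offset:len(df) - k + offset + 1].reset_index(drop=True)
        part.columns = [f"{col}_{offset}" for col in part.columns]  # e.g. low_0, low_1, ...
        parts.append(part)
    return pd.concat(parts, axis=1)

# usage with a small frame shaped like the question's input
df = pd.DataFrame({'date': ['d1', 'd2', 'd3', 'd4'],
                   'low':  ['l1', 'l2', 'l3', 'l4'],
                   'high': ['h1', 'h2', 'h3', 'h4'],
                   'close': ['c1', 'c2', 'c3', 'c4']})
print(sliding_rows(df, 3))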
You can save a few steps with pandas' rolling function, using the desired subarray length as the window size (window=SUBARR_SZ). Then join each row's columns with a ',', transform the result into a Series so a join can be applied again, this time across the rows of each window (each window holds the desired number of consecutive rows).
import pandas as pd

df = pd.read_csv('sample.csv')
SUBARR_SZ = 3  # subarray size

df_list = []
for w in df.rolling(window=SUBARR_SZ):
    if len(w) == SUBARR_SZ:
        s = w.apply(','.join, axis=1).apply(pd.Series).apply(','.join)
        df_list.append(s)

dff = pd.concat(df_list).reset_index(drop=True)
print(dff)
Output from dff
0 d1,l1,h1,c1,d2,l2,h2,c2,d3,l3,h3,c3
1 d2,l2,h2,c2,d3,l3,h3,c3,d4,l4,h4,c4
2 d3,l3,h3,c3,d4,l4,h4,c4,d5,l5,h5,c5
3 d4,l4,h4,c4,d5,l5,h5,c5,d6,l6,h6,c6
4 d5,l5,h5,c5,d6,l6,h6,c6,d7,l7,h7,c7
dtype: object
I want to create a dataframe out of arrays with different sizes, and I want to fill the missing values depending on similar values.
I've tried sticking the arrays together and doing a sort and a split with numpy. I then calculate the mean of each split and decide whether a value is close enough to that mean or whether it is better to fill with NaN.
import numpy as np
import pandas as pd

def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx

# generate sample data
loa = [((np.arange(np.random.randint(1, 3), np.random.randint(3, 6))) * val).tolist()
       for val in np.random.uniform(0.9, 1.1, 5)]
# reshape
flat_list = sum(loa, [])
# add some attributes
attributes = [np.random.randint(-3, -1) for x in range(len(flat_list))]
# sort and split on percentage change
flat_list.sort()
arr = np.array(flat_list)
arr_splits = np.split(arr, np.argwhere(np.diff(arr)/arr[1:]*100 > 12)[:, 0])
# means of the splits
means = [np.mean(arr) for arr in arr_splits]
# create dataframe
i = 0
res = np.zeros((len(loa), len(means)*2)) * np.nan
for row, l in enumerate(loa):
    for val in l:
        col = find_nearest(means, val)
        res[row, col] = val
        res[row, col+len(means)] = attributes[i]
        i = i + 1
df = pd.DataFrame(res)
Is there another way to do this more directly with pandas, or something more elegant?
I am using Python 2.7 on Windows 10 and the Spyder Python IDE
I am trying to calculate posterior conditional probabilities of reaching any node in a network from any other node. The network is defined by a dataframe where each row is a directional connection (called edge in graph theory) between fld1 and fld2, and value is the probability of moving from fld1 to fld2.
In order to calculate the probabilities I need to loop through the dataframe. I am using iterrows from pandas but I am also implementing a while loop for capturing indirect paths from one node to another.
My code is below. My question is: is it correct to use pandas iterrows together with a while loop like this?
import pandas as pd
#from itertools import combinations
from itertools import permutations
df = pd.DataFrame({'fld1': ['apple', 'apple', 'bear','bear','car','car','car','dee','dee','eagle','eagle']
, 'fld2': ['bear', 'car', 'car','eagle','bear','dee','eagle','eagle','foo','dee','foo']
, 'value': [.3,.3,.2,.1,.3,.3,.2,.4,.1,.3,.2]})
## define global objects
#starter value holders
og_fld1_val = []
og_fld2_val = []
og_r_val = []
#df of already checked r_vals
dnc_df = pd.DataFrame(columns = ['fld1','fld2','distance'])
##df of all r_vals to find
flds = pd.Series(df.fld1.unique())
flds = pd.Series(flds.append(pd.Series(df.fld2.unique())).unique())
combos = []
for L in range(0, len(flds)+1):
    for subset in permutations(flds, L):
        if len(subset) == 2:
            combos.append(subset)
rel_df = pd.DataFrame.from_records(data = combos, columns = ['fld1','fld2'])
####for all rows of df
#for each fld1-fld2 relationship in df
# aka (each edge in the network, starting with a-b)
for index, row in df.iterrows():
    #take row 1 info for fld1 and fld2 separately
    og_fld1_val = df.fld1[index]
    og_fld2_val = df.fld2[index]
    og_r_val = df.value[index]
    #add info to do not try again list
    dnc_df.set_value(index, 'fld1', og_fld1_val)
    dnc_df.set_value(index, 'fld2', og_fld2_val)
    #variable value holders
    #fld1_val = []
    #fld2_val = []
    #r_val = []
    ###fld1 has been established now for each path from fld1 outwards
    for index, row in df.loc[df.fld1 == og_fld1_val].iterrows():
        #see next connection that is not the terminal node
        while og_fld2_val != df.loc[df.fld1 == og_fld1_val].fld2[index]:
            #capture relationship between previous node and next node
            try:
                r_val
            except:
                r_val = df.loc[df.fld1 == og_fld1_val].value[index]
            else:
                r_val = r_val * df.loc[df.fld1 == og_fld1_val].value[index]
            #if r_val in globals():
            #    r_val = r_val * df.loc[df.fld1 == og_fld1_val].value[index]
            #else:
            #    r_val = df.loc[df.fld1 == og_fld1_val].value[index]
            if r_val < 0.001:
                continue
My goal is to create the r_val column so that df becomes df2. In reality my dataset is massive (500K+ rows) and this is just a sample dataset.
df2 = pd.DataFrame({'fld1': ['apple', 'apple', 'bear','bear','car','car','car','dee','dee','eagle','eagle']
, 'fld2': ['bear', 'car', 'car','eagle','bear','dee','eagle','eagle','foo','dee','foo']
, 'value': [.3,.3,.2,.1,.3,.3,.2,.4,.1,.3,.2]
, 'r_val': [.39,.36,.2,.164,.3,.369,.35,.4,.18,.3,.23]})
import pandas as pd
df = pd.DataFrame({'fld1': ['apple', 'apple', 'bear','bear','car','car','car','dee','dee','eagle','eagle']
, 'fld2': ['bear', 'car', 'car','eagle','bear','dee','eagle','eagle','foo','dee','foo']
, 'value': [.3,.3,.2,.1,.3,.3,.2,.4,.1,.3,.2]})
gsums = df.groupby("fld1")["value"].sum() # source group sums (select the value column so it aligns as a Series)
df.set_index("fld1", inplace=True) # set index to source column
df["sums"] = gsums # new column sums in dataframe for next operation
df["rval"] = df["value"] / df["sums"] # divide the columns
df.drop("sums", axis=1, inplace=True) # drop the sums column
df.reset_index(inplace=True) # reset index to the original
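The same per-source normalization can also be done without the set_index/drop round trip, using transform (a sketch on the question's df; whether this ratio is exactly the r_val you are after is a separate matter):

import pandas as pd

df = pd.DataFrame({'fld1': ['apple', 'apple', 'bear', 'bear', 'car', 'car', 'car', 'dee', 'dee', 'eagle', 'eagle'],
                   'fld2': ['bear', 'car', 'car', 'eagle', 'bear', 'dee', 'eagle', 'eagle', 'foo', 'dee', 'foo'],
                   'value': [.3, .3, .2, .1, .3, .3, .2, .4, .1, .3, .2]})

# transform('sum') broadcasts each source node's total back onto its own rows
df["rval"] = df["value"] / df.groupby("fld1")["value"].transform("sum")
print(df)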
But it would have been easier if you had your transition likelihoods/probabilities stored in an n-by-n frame. Then you could do, for example:
import pandas as pd
from numpy.random import rand
vars = ("fld1", "fld2", "fld3")
n = len(vars)
df = pd.DataFrame(rand(n, n), index=vars, columns=vars)
dfprobs = df/df.sum(axis=0) # normalize so each column sums to 1; use df.div(df.sum(axis=1), axis=0) to make each row sum to 1 instead
Also, for graphs in Python I recommend looking at igraph and networkx.
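For instance, a minimal networkx sketch (using a subset of the question's edges) that builds a weighted directed graph from the fld1/fld2/value rows and enumerates paths between two nodes:

import pandas as pd
import networkx as nx

edges = pd.DataFrame({'fld1': ['apple', 'apple', 'bear', 'car'],
                      'fld2': ['bear', 'car', 'eagle', 'bear'],
                      'value': [.3, .3, .1, .3]})

# directed graph with the transition value stored as an edge attribute
G = nx.from_pandas_edgelist(edges, source='fld1', target='fld2',
                            edge_attr='value', create_using=nx.DiGraph)

# e.g. every simple path from apple to eagle
print(list(nx.all_simple_paths(G, source='apple', target='eagle')))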
I have a two dimensional (or more) pandas DataFrame like this:
>>> import pandas as pd
>>> df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
>>> df
A B
0 0 1
1 2 3
2 4 5
Now suppose I have a numpy array like np.array([2,3]) and want to check if there is any row in df that matches the contents of my array. Here the answer should obviously be True, but e.g. np.array([1,2]) should return False as there is no row with both 1 in column A and 2 in column B.
Surely this is easy, but I don't see it right now.
Turns out it is really easy, the following does the job here:
>>> ((df['A'] == 2) & (df['B'] == 3)).any()
True
>>> ((df['A'] == 1) & (df['B'] == 2)).any()
False
Maybe somebody comes up with a better solution which allows directly passing in the array and the list of columns to match.
Note that the parentheses around df['A'] == 2 are not optional, since the & operator binds more tightly than the == operator.
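Following up on the wish above, a small sketch of a helper that takes the array and the list of columns directly (the name row_matches is made up):

import numpy as np
import pandas as pd

def row_matches(df, values, columns):
    # True if any row of df[columns] equals `values` element-wise
    return df[list(columns)].eq(list(values)).all(axis=1).any()

df = pd.DataFrame([[0, 1], [2, 3], [4, 5]], columns=['A', 'B'])
print(row_matches(df, np.array([2, 3]), ['A', 'B']))   # True
print(row_matches(df, np.array([1, 2]), ['A', 'B']))   # False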
An easier way is:
a = np.array([2,3])
(df == a).all(1).any()
If you also want to return the index where the matches occurred:
index_list = df[(df['A'] == 2)&(df['B'] == 3)].index.tolist()
To find rows where a single column equals a certain value:
df[df['column name'] == value]
To find rows where multiple columns equal different values, note the inner ():
df[(df["Col1"] == Value1) & (df["Col2"] == Value2) & ...]
A simple solution with a dictionary:
def check_existance(dict_of_values, df):
    v = df.iloc[:, 0] == df.iloc[:, 0]
    for key, value in dict_of_values.items():
        v &= (df[key] == value)
    return v.any()
import pandas as pd
df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
this_row_exists = {'A':2, 'B':3}
check_existance(this_row_exists, df)
# True
this_row_does_not_exist = {'A':2, 'B':5}
check_existance(this_row_does_not_exist, df)
# False
An answer that works with larger dataframes, so you don't need to check each column manually:
import pandas as pd
import numpy as np

# define variables
df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
a = np.array([2,3])

def check_if_np_array_is_in_df(df, a):
    # transform a into a dataframe
    da = pd.DataFrame(np.expand_dims(a, axis=0), columns=['A', 'B'])
    # drop duplicates from df
    ddf = df.drop_duplicates()
    result = pd.concat([ddf, da]).shape[0] - pd.concat([ddf, da]).drop_duplicates().shape[0]
    return result

print(check_if_np_array_is_in_df(df, a))
print(check_if_np_array_is_in_df(df, [1, 3]))
If you want to return the row where the matches occurred:
resulting_row = df[(df['A'] == 2)&(df['B'] == 3)].values
I have 2 pandas data frames, df and df_min. I apply some filters to df, which results in a single row of data, and I'd like to append that row to df_min. I tried using a loop to traverse df, and tried using loc to append the row to df_min. I keep getting an "Incompatible indexer with DataFrame" ValueError for the line where I use loc. I guess I am not using loc correctly. What would be the best way to accomplish what I am trying to do?
i = 0
for elem in vehicles:
    for state in limit_states:
        a = df[(df.VEHICLE == elem) & (df.LIMIT_STATE == state)]
        df_min.loc[i] = a[(a.RF == np.min(a.RF))].head(1)  # results in a single row
        i = i + 1
EDIT: I also tried the following instead of loc, but got the same error:
df_min.ix[i] = a[(a.RF == np.min(a.RF))].head(1)
EDIT 2: Tried the following, but got a "first argument must be a list-like of pandas objects, you passed an object of type 'DataFrame'" error this time.
for elem in vehicles:
    for state in limit_states:
        a = df[(df.VEHICLE == elem) & (df.LIMIT_STATE == state)]
        df_min = pd.concat(a[(a.RF == np.min(a.RF))].head(1))
probably something like this would be helpful:
df_min = pd.concat([ df[(df.VEHICLE == elem) & (df.LIMIT_STATE == state)]
                     for elem in vehicles for state in limit_states ])
edit:
xs = [ df[(df.VEHICLE == elem) & (df.LIMIT_STATE == state)]
       for elem in vehicles for state in limit_states ]
df_min = pd.concat([ a[(a.RF == np.min(a.RF))].head(1) for a in xs ])
Depending on the lists vehicles and limit_states, you can probably also achieve what you are trying to do using groupby; something like:
fn = lambda a: a[(a.RF == np.min(a.RF))].head(1)
df.groupby( ['VEHICLE', 'LIMIT_STATE'] ).apply( fn )
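An equivalent sketch using idxmin, which picks the index of the minimum RF within each group and avoids the lambda (the sample values below are made up; it assumes RF contains no NaNs):

import pandas as pd

# made-up data shaped like the question's df
df = pd.DataFrame({'VEHICLE': ['v1', 'v1', 'v2', 'v2'],
                   'LIMIT_STATE': ['s1', 's1', 's1', 's2'],
                   'RF': [0.7, 0.4, 0.9, 0.2]})

# idxmin returns the row label of the smallest RF in each group
df_min = df.loc[df.groupby(['VEHICLE', 'LIMIT_STATE'])['RF'].idxmin()]
print(df_min)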