I have a data frame as follows:
Input:
date,low,high,close
d1,l1,h1,c1
d2,l2,h2,c2
d3,l3,h3,c3
d4,l4,h4,c4
d5,l5,h5,c5
d6,l6,h6,c6
d7,l7,h7,c7
Output:
d1,l1,h1,c1,d2,l2,h2,c2,d3,l3,h3,c3
d2,l2,h2,c2,d3,l3,h3,c3,d4,l4,h4,c4
d3,l3,h3,c3,d4,l4,h4,c4,d5,l5,h5,c5
d4,l4,h4,c4,d5,l5,h5,c5,d6,l6,h6,c6
....
Basically, join all rows, split them into subarrays of size 3 starting at each index, and create the output data frame.
The following code works, but it's too verbose and slow. Does pandas have something built in for this?
def flatten(df):
    candles = []
    i = 0
    while i < len(df):
        candles.append(df.iloc[i])
        i = i + 1
    return candles

def slide_and_expand(candles, k):
    return [candles[i:i+k] for i in range(len(candles) - k + 1)]

def candle_to_dict(col_name_prefix, candle_series):
    candle_dict = {}
    for index, val in candle_series.iteritems():
        col_name = col_name_prefix + index
        candle_dict[col_name] = val
    return candle_dict

def candle_group_to_feature_vector(candle_group):
    feature_vector_dict = {}
    i = 0
    for candle in candle_group:
        col_name_prefix = f"c{i}_"
        candle_dict = candle_to_dict(col_name_prefix, candle)
        feature_vector_dict.update(candle_dict)
        i = i + 1
    return feature_vector_dict

def candle_groups_to_feature_vectors(candle_groups):
    feature_vectors = []
    for candle_group in candle_groups:
        feature_vector = candle_group_to_feature_vector(candle_group)
        feature_vectors.append(feature_vector)
    return feature_vectors

fv_len = 3
candles = flatten(data)
candle_groups = slide_and_expand(candles, fv_len)
feature_vectors = candle_groups_to_feature_vectors(candle_groups)
data_fv = pd.DataFrame.from_dict(feature_vectors, orient='columns')
data_fv
You could do something like this:
n = len(df.index)  # number of rows in the original dataframe 'df'
df_0 = df.loc[0:n-3].reset_index(drop=True)
df_1 = df.loc[1:n-2].reset_index(drop=True)
df_2 = df.loc[2:n-1].reset_index(drop=True)
df_final = pd.concat([df_0, df_1, df_2], axis=1)  # reset_index keeps the slices from re-aligning on the original index
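To generalise the same slicing idea to an arbitrary window size k, a rough sketch (the c{i}_ column prefixes are my own choice, echoing the naming in the question, and avoid duplicate column names):
k = 3  # window size
parts = [df.iloc[i:len(df) - k + 1 + i].reset_index(drop=True).add_prefix(f'c{i}_')
         for i in range(k)]
df_final = pd.concat(parts, axis=1)  # one row per window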
You can save a few steps using pandas' rolling function, with the window size set to the desired subarray length (window=SUBARR_SZ). Within each window, join the columns of each row with a ',', then join the resulting per-row strings again to get one string per window.
import pandas as pd

df = pd.read_csv('sample.csv')

SUBARR_SZ = 3  # subarray size
df_list = []
for w in df.rolling(window=SUBARR_SZ):  # iterating over a Rolling object requires pandas >= 1.1
    if len(w) == SUBARR_SZ:  # skip the incomplete windows at the start
        s = w.apply(','.join, axis=1).apply(pd.Series).apply(','.join)
        df_list.append(s)
dff = pd.concat(df_list).reset_index(drop=True)
print(dff)
Output from dff
0 d1,l1,h1,c1,d2,l2,h2,c2,d3,l3,h3,c3
1 d2,l2,h2,c2,d3,l3,h3,c3,d4,l4,h4,c4
2 d3,l3,h3,c3,d4,l4,h4,c4,d5,l5,h5,c5
3 d4,l4,h4,c4,d5,l5,h5,c5,d6,l6,h6,c6
4 d5,l5,h5,c5,d6,l6,h6,c6,d7,l7,h7,c7
dtype: object
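If you need the result as a wide DataFrame rather than comma-joined strings, the output above can be split back out; a small sketch, with column names of my own choosing that mirror the c{i}_ prefixes in the question:
data_fv = dff.str.split(',', expand=True)
data_fv.columns = [f'c{i}_{col}' for i in range(SUBARR_SZ) for col in df.columns]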
Input
mydfs= [df1,df2,df3,df4,df5,df6,df7,df8,df9]
My Code
import pandas as pd
df_1 = pd.concat([mydfs[0],mydfs[1],mydfs[2]])
df_m = df_1.merge(mydfs[2])
df_2 = pd.concat([mydfs[3],mydfs[4],mydfs[5]])
df_m1 = df_2.merge(mydfs[5])
df_3 = pd.concat([mydfs[6],mydfs[7],mydfs[8]])
df_m2 = df_3.merge(mydfs[8])
But I want to write this dynamically instead of doing it manually. Is it possible with a for loop? The list of data frames may grow in the future.
You can use a dictionary comprehension:
N = 3
out_dfs = {f'df_{i//N+1}': pd.concat(mydfs[i:i+N])
           for i in range(0, len(mydfs), N)}
output:
{'df_1': <concatenation result of ['df1', 'df2', 'df3']>,
'df_2': <concatenation result of ['df4', 'df5', 'df6']>,
'df_3': <concatenation result of ['df7', 'df8', 'df9']>,
}
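If you also need the merged frames from your code (df_m1, df_m2, ...), a hedged sketch that keeps everything in one dictionary instead of separate variables (the key names are my own choice):
out = {}
for k, i in enumerate(range(0, len(mydfs), N), start=1):
    chunk = mydfs[i:i+N]
    out[f'df_{k}'] = pd.concat(chunk)
    out[f'df_m{k}'] = out[f'df_{k}'].merge(chunk[-1])  # merge with the last frame of the chunk, as in the question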
You can use a loop with globals() to iterate through mydfs and define two "kth" variables on each round:
i = 0
k = 1
while i < len(mydfs):
    globals()["df_{}".format(k)] = pd.concat([mydfs[i], mydfs[i+1], mydfs[i+2]])
    globals()["df_m{}".format(k)] = globals()["df_{}".format(k)].merge(mydfs[i+2])
    i = i + 3
    k = k + 1
I want to create a dataframe out of arrays of different sizes, filling the missing values based on similar values.
I've tried sticking the arrays together and doing a sort and a split with numpy. I then calculate the mean of each split and decide whether a value is close enough to the mean or whether it's better to fill with NaN.
import numpy as np
import pandas as pd

def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx

# generate sample data
loa = [((np.arange(np.random.randint(1, 3), np.random.randint(3, 6))) * val).tolist()
       for val in np.random.uniform(0.9, 1.1, 5)]
# reshape
flat_list = sum(loa, [])
# add some attributes
attributes = [np.random.randint(-3, -1) for x in range(len(flat_list))]
# sort and split on percentage change
flat_list.sort()
arr = np.array(flat_list)
arr_splits = np.split(arr, np.argwhere(np.diff(arr) / arr[1:] * 100 > 12)[:, 0])
# means of the splits
means = [np.mean(split) for split in arr_splits]
# create dataframe
i = 0
res = np.zeros((len(loa), len(means) * 2)) * np.nan
for row, l in enumerate(loa):
    for val in l:
        col = find_nearest(means, val)
        res[row, col] = val
        res[row, col + len(means)] = attributes[i]
        i = i + 1
df = pd.DataFrame(res)
Is there another way to do this more directly with pandas, or something more elegant?
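For the sort/split/mean step, a rough pandas sketch of the same grouping idea, assuming flat_list from above; it only covers the binning, not the final reshaping, and the split point can differ by one element from the np.split version:
s = pd.Series(sorted(flat_list))
group_id = (s.diff() / s * 100 > 12).cumsum()  # start a new group where the jump exceeds 12%
means = s.groupby(group_id).mean()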
Hello there, I would like to iterate over the column 'CPB%' and add the computations to a related column called 'Proba'. My dataframe looks like this:
What I tried so far looks like this:
bins = np.linspace(0, 1, num=100)
dCPB = df['CPB%']
df['binnedB'] = pd.cut(dCPB, bins)
dfnew = pd.DataFrame(pd.cut(df['CPB%'], bins=bins).value_counts()).sort_index(ascending=True)
dfnew['binned'] = dfnew.index
total = dfnew['CPB%'].sum()
idx = total
for index, row in dfnew.iterrows():
    idx = idx - row['CPB%']
    row['Proba'] = float(idx) / float(total)
But my iteration does not update my empty column 'Proba'. Any idea why? Thanks!
I think the problem is that you are assigning the result back to the row, which doesn't get stored anywhere. Instead you can do:
total = dfnew['CPB%'].sum()  # as in the question
idx = total
proba = []
for index, row in dfnew.iterrows():
    idx = idx - row['CPB%']
    proba.append(float(idx) / float(total))
dfnew['Proba'] = proba
However, this is not the best way; you can use .apply with axis=1 to do row-wise calculations on a data frame.
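For illustration, a minimal sketch of .apply with axis=1 on this frame (the 'share' column is hypothetical); note that your calculation is cumulative, so the cumsum approach in the next answer is a better fit for it:
total = dfnew['CPB%'].sum()
dfnew['share'] = dfnew.apply(lambda row: row['CPB%'] / total, axis=1)  # independent per-row calculation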
You can use pd.Series.cumsum to perform your iterative deductions:
total = dfnew['CPB%'].sum()
dfnew['Proba'] = 1 - dfnew['CPB%'].cumsum() / total
With Pandas you should look to vectorise algorithms, which usually involves column-wise operations as opposed to a row-wise for loop. Here's a complete demonstration:
df = pd.DataFrame({'A': list(range(1, 7))})

def jpp(df):
    total = df['A'].sum()
    df['Proba'] = 1 - df['A'].cumsum() / total
    return df

def yolo(df):
    total = df['A'].sum()
    idx = total
    proba = []
    for index, row in df.iterrows():
        idx = idx - row['A']
        proba.append(float(idx) / float(total))
    df['Proba'] = proba
    return df

# check results are the same
assert df.pipe(jpp).equals(df.pipe(yolo))

%timeit df.pipe(jpp)   # 691 µs
%timeit df.pipe(yolo)  # 840 µs
I'm creating a dataframe the following way:
filtered_data.groupby('weekday').agg({'airing': np.sum, 'uplift': [np.sum,np.mean]})
Which creates a table of:
        airing uplift
           sum    sum      mean
weekday
1           11     20  1.818182
2           24     46  1.916667
...
What I want is to include a final row which is the total for each column.
Thanks in advance!
You can use .loc in order to achieve that:
df.loc[len(df)] = [df[col].sum() for col in df.columns]
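A hedged note on applying this here: with weekday as the index, len(df) can collide with an existing label, so a string label may be safer, and summing the mean column gives a sum of per-group means rather than an overall mean. Assuming the aggregated frame is stored as agg_df (name is mine):
agg_df = filtered_data.groupby('weekday').agg({'airing': np.sum, 'uplift': [np.sum, np.mean]})
agg_df.loc['Total'] = [agg_df[col].sum() for col in agg_df.columns]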
In this case, you should probably create a Series that keeps track of your summary stats. You could then concat if you needed to for display purposes.
summary = pd.Series([filtered_data.airing.sum(),
                     filtered_data.uplift.sum(),
                     filtered_data.uplift.mean()],
                    name='summary')
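A small sketch of the concat-for-display step mentioned above, assuming the aggregated frame from the question is stored as agg_df (name is mine):
summary.index = agg_df.columns  # align the three stats with the three aggregate columns
display_df = pd.concat([agg_df, summary.to_frame().T])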
For this purpose I created an aggregation tool which behaves like GROUPING SETS in SQL. Supply columns by which to group and an aggregate function, and get back an aggregated DataFrame.
import itertools as it
import pandas as pd
from pandas.testing import assert_frame_equal  # pandas.util.testing is deprecated

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return it.chain.from_iterable(it.combinations(s, r) for r in range(len(s) + 1))

def grouper(df, grpby, aggfunc):
    ''' produces aggregate DataFrame from DataFrames for non-redundant groupings
        `workingdf` is used to avoid modifying original DataFrame
    '''
    uniqcols = set(col for col in grpby if len(df[col].unique()) == 1)
    subset = set()
    for col in uniqcols:
        for grp in powerset(grpby):
            if col in grp:
                subset.add(grp)  # add level of aggregation only when non-redundant
    if len(subset) == 0:
        for grp in powerset(grpby):
            subset.add(grp)
    workingdf = df.copy()
    for idx, i in enumerate(subset):
        if i != ():
            tmp = aggfunc(workingdf.groupby(i))
        else:
            # hack to get output to be a DataFrameGroupBy object:
            # insert dummy column on which to group by
            dummycolname = hash(tuple(workingdf.columns.tolist()))
            workingdf[dummycolname] = ''
            tmp = aggfunc(workingdf.groupby(dummycolname))
        # drop the index and add it back
        if i == ():
            tmp.reset_index(drop=True, inplace=True)
        else:
            tmp.reset_index(inplace=True)
        for j in grpby:
            if j not in tmp:  # if column is not in DataFrame add it
                tmp[j] = '(All)'
        # new list with all columns including aggregate ones; do this only once
        if idx == 0:
            finalcols = grpby[:]
            addlcols = [k for k in tmp if k not in grpby]  # aggregate columns
            finalcols.extend(addlcols)
        # reorder columns
        tmp = tmp[finalcols]
        if idx == 0:
            final = tmp
            del tmp
        else:
            final = pd.concat([final, tmp])
            del tmp
    del workingdf
    final.sort_values(finalcols, inplace=True)
    final.reset_index(drop=True, inplace=True)
    return final

def agg(grpbyobj):
    ''' the purpose of this function is to:
        specify aggregate operation(s) you wish to perform,
        name the resulting column(s) in the final DataFrame.
    '''
    tmp = pd.DataFrame()
    tmp['Total (n)'] = grpbyobj['Total'].sum()
    return tmp

if __name__ == '__main__':
    df = pd.DataFrame({'Area': ['a', 'a', 'b'],
                       'Year': [2014, 2014, 2014],
                       'Month': [1, 2, 3],
                       'Total': [4, 5, 6]})
    final = grouper(df, grpby=['Area', 'Year'], aggfunc=agg)
    # test against expected result
    expected = pd.DataFrame({'Year': {0: 2014, 1: 2014, 2: 2014},
                             'Total (n)': {0: 15, 1: 9, 2: 6},
                             'Area': {0: '(All)', 1: 'a', 2: 'b'}})
    expected = expected[final.columns.tolist()]
    try:
        # check_names kwarg True: compare indexes and columns
        assert_frame_equal(final, expected, check_names=True)
    except AssertionError:
        raise
I have two pandas DataFrames A and B, with columns ['start', 'end', 'value'] but not the same number of rows. I'd like to set the values for each row in A as follows:
A.iloc(i) = B['value'][B['start'] < A[i,'start'] & B['end'] > A[i,'end']]
Multiple rows of B may satisfy this condition for a given i; in that case the max or sum of the corresponding rows would be the result. If none satisfies it, the value of A.iloc[i] should either not be updated or be set to a default value of 0 (either way would be fine).
I'm interested in finding the most efficient way of doing this.
import numpy as np
import pandas as pd

np.random.seed(1)
lenB = 10
lenA = 20
B_start = np.random.rand(lenB)
B_end = B_start + np.random.rand(lenB)
B_value = np.random.randint(100, 200, lenB)

A_start = np.random.rand(lenA)
A_end = A_start + np.random.rand(lenA)

# if you use a dataframe
# B_start = B["start"].values
# B_end = ...
mask = (A_start[:, None] > B_start) & (A_end[:, None] < B_end)
r, c = np.where(mask)
result = pd.Series(B_value[c]).groupby(r).max()
print(result)
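A hedged follow-up for the write-back step the question asks about, assuming A is built from A_start/A_end as above; rows of A with no matching interval in B default to 0 (the reindex/fill_value choice is mine):
A = pd.DataFrame({'start': A_start, 'end': A_end})
per_row_max = pd.Series(B_value[c]).groupby(r).max()  # max over all matching rows of B, indexed by row of A
A['value'] = per_row_max.reindex(range(lenA), fill_value=0).values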