d = {'key': [1,2,3], 'a': [True,True, False], 'b': [False,False,True]}
df = pd.DataFrame(d)
Current melt function is:
df2 = df.melt(id_vars=['key'], var_name = 'letter', value_name = 'Bool')
df2 = df2.query('Bool == True')
Is there a way to incorporate that 'True' condition in the melt function itself? As I keep adding entries to my df and it has hundreds of columns, I assume it would be much less costly to pull only the values I need instead of melting the entire df and then filtering. Any ideas?
Use pd.melt, replacing False with NaN first and then calling dropna().
pd.melt(df.replace(False, np.nan), id_vars=['key'],var_name = 'letter', value_name = 'Bool').dropna()
key letter Bool
0 1 a True
1 2 a True
5 3 b True
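If only some of the non-key columns are boolean, a hedged variant (my own tweak, with value_cols as an assumed list of the boolean columns you actually want to melt) keeps the replacement away from the other columns:
value_cols = ['a', 'b']  # assumption: the boolean columns to melt
pd.melt(df[['key'] + value_cols].replace(False, np.nan),
        id_vars=['key'], var_name='letter', value_name='Bool').dropna()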
You can filter the non-key columns first, melt the results, and concat the melted rows back together. See the following:
import pandas as pd
import numpy as np
import time
d = {'key': [1,2,3], 'a': [True,True, False], 'b': [False,False,True]}
df = pd.DataFrame(d)
start_time = time.time()
key_column_name = 'key'
key_column_loc = list(df.columns).index(key_column_name)
filtered_frame = None
for letter in [s for s in list(df.columns) if s != key_column_name]:
    true_booleans = np.nonzero(df[letter].values)[0]
    melted_df = df.iloc[true_booleans][[key_column_name, letter]].reset_index(drop=True).melt(id_vars=[key_column_name], var_name='letter', value_name='Bool')
    if filtered_frame is None:
        filtered_frame = melted_df
    else:
        filtered_frame = pd.concat((filtered_frame, melted_df), axis=0)
end_time = time.time()
print(filtered_frame, '\n\n', end_time - start_time, 'seconds!')
Output
key letter Bool
0 1 a True
1 2 a True
0 3 b True
0.011133432388305664 seconds!
Compared to your code it is slower (your score is 0.008090734481811523 seconds!); however, as the number of rows increases I would expect the approach above to become more efficient. Looking forward to the results.
Regarding the discussion on speed (Benchmarks)
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
# Benchmark Tests
d = {'key': [1,2,3], 'a': [True,True, False], 'b': [False,False,True]}
df_initial = pd.DataFrame(d)
data_size = [10, 100, 10000, 50000, 100000, 500000, 1000000, 5000000, 10000000, 50000000]
scores_current = []
scores_golden_lion = []
scores_sammywemmy = []
scores_wwnde = []
scores_slybot = []
for n_rows in data_size:
    df = df_initial.sample(n=n_rows, replace=True).reset_index(drop=True)

    ## Current method
    start_time = time.time()
    df_current = df.melt(id_vars=['key'], var_name='letter', value_name='Bool')
    df_current = df_current.query('Bool == True')
    end_time = time.time()
    scores_current.append(end_time - start_time)

    ## Golden Lion
    start_time = time.time()
    df_golden_lion = df.melt(id_vars=['key'], var_name='letter', value_name='Boolean')
    df_golden_lion = df_golden_lion.drop(df_golden_lion.index[df_golden_lion['Boolean'] == False])
    end_time = time.time()
    scores_golden_lion.append(end_time - start_time)

    ## sammywemmy
    start_time = time.time()
    box = df.iloc[:, 1:]
    len_df = len(df)
    letters = np.tile(box.columns, (len_df, 1))[box]
    df_sammywemmy = pd.DataFrame({'key': df.key.array,
                                  'letter': letters,
                                  'Bool': [True]*len_df})
    end_time = time.time()
    scores_sammywemmy.append(end_time - start_time)

    ## wwnde
    start_time = time.time()
    df_wwnde = pd.melt(df.replace(False, np.nan), id_vars=['key'], var_name='letter', value_name='Bool').dropna()
    end_time = time.time()
    scores_wwnde.append(end_time - start_time)

    ## Slybot
    start_time = time.time()
    key_column_name = 'key'
    key_column_loc = list(df.columns).index(key_column_name)
    filtered_frame = None
    for letter in [s for s in list(df.columns) if s != key_column_name]:
        true_booleans = np.nonzero(df[letter].values)[0]
        melted_df = df.iloc[true_booleans][[key_column_name, letter]].melt(id_vars=[key_column_name], var_name='letter', value_name='Bool')
        if filtered_frame is None:
            filtered_frame = melted_df
        else:
            filtered_frame = pd.concat((filtered_frame, melted_df), axis=0)
    end_time = time.time()
    scores_slybot.append(end_time - start_time)
plt.plot(data_size, scores_current, label = "Current method")
plt.plot(data_size, scores_golden_lion, label = "Golden Lion")
plt.plot(data_size, scores_sammywemmy, label = "sammywemmy")
plt.plot(data_size, scores_wwnde, label = "wwnde")
plt.plot(data_size, scores_slybot, label = "Slybot")
plt.legend()
plt.show()
Interesting to see that none of the other answers beats the originally suggested method once the dataset reaches 500,000 rows! Up to about 200,000 rows, though, sammywemmy's method is the clear winner.
The melt-and-filter approach is already efficient; I'd just stick with loc instead of query, especially if your data is not that large (<200_000 rows).
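For instance, a minimal sketch of the loc variant, using the same melt call as in the question:
df2 = df.melt(id_vars=['key'], var_name='letter', value_name='Bool')
df2 = df2.loc[df2['Bool']]  # the boolean column itself works as the mask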
Another option is to skip melt, use numpy, and build a new dataframe:
box = df.iloc[:, 1:]
len_df = len(df)
letters = np.tile(box.columns, (len_df,1))[box]
pd.DataFrame({'key':df.key.array,
'letter' : letters,
'Bool' : [True]*len_df})
key letter Bool
0 1 a True
1 2 a True
2 3 b True
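The tile-and-mask line above assumes exactly one True per row, so that letters lines up with key and the Bool column can be hard-coded to True. A hedged, more general sketch (my own variant) builds the frame from the positions of the True cells instead:
rows, cols = np.nonzero(box.to_numpy())
pd.DataFrame({'key': df['key'].to_numpy()[rows],
              'letter': box.columns.to_numpy()[cols],
              'Bool': True})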
melt moves column data and stacks it vertically, resulting in two new columns: one holding the name of the column being stacked (the variable) and one holding its values.
d = {'key': [1,2,3], 'a': [True,True, False], 'b': [False,False,True],'c':['Batchelor','Masters','Doctorate']}
df = pd.DataFrame(d)
df2 = df.melt(id_vars=['key'], var_name = 'letter', value_name = 'Boolean')
df2=df2.drop(df2.index[df2['Boolean'] == False])
print(df2)
output
key letter Boolean
0 1 a True
1 2 a True
5 3 b True
6 1 c Batchelor
7 2 c Masters
8 3 c Doctorate
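Note that once a non-boolean column like 'c' is included, the melted Boolean column mixes True with strings. A hedged tweak of mine restricts the melt to the boolean columns via value_vars:
bool_cols = df.select_dtypes(bool).columns
df3 = df.melt(id_vars=['key'], value_vars=bool_cols,
              var_name='letter', value_name='Boolean')
df3 = df3[df3['Boolean']]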
The following function takes market data (OHLCV candles) and tries to find periods where the price oscillates between two bounds (consolidation zones). I wrote it by translating, from Pine Script to Python, an open-source indicator found on TradingView.
The function works, in the sense that it correctly finds consolidation zones. But the problem is performance, mainly due to the for loop at the end: with 30K candles it takes only ~5 seconds to execute the code before the loop, and then over 2 minutes to run the loop itself.
import numpy as np
import pandas as pd
from pandas import (DataFrame, Series)
def _find_zz(row: Series):
    if pd.notnull(row['hb']) and pd.notnull(row['lb']):
        if row['dir'] == 1:
            return row['hb']
        else:
            return row['lb']
    else:
        return row['hb'] if pd.notnull(row['hb']) else row['lb'] if pd.notnull(row['lb']) else np.NaN

def consolidation_zones(dataframe: DataFrame, timeperiod: int = 100,
                        minlength: int = 20) -> DataFrame:
    rolling = dataframe.rolling(timeperiod, min_periods=1)
    idxmax = rolling['high'].apply(lambda x: x.idxmax()).astype(int)
    idxmin = rolling['low'].apply(lambda x: x.idxmin()).astype(int)
    highest = pd.concat({'value': dataframe['high'], 'offset': dataframe.index - idxmax}, axis=1)
    lowest = pd.concat({'value': dataframe['low'], 'offset': dataframe.index - idxmin}, axis=1)
    hb = highest.apply(lambda x: x['value'] if x['offset'] == 0 else np.NaN, axis=1)
    lb = lowest.apply(lambda x: x['value'] if x['offset'] == 0 else np.NaN, axis=1)
    direction = pd.concat({'hb': hb, 'lb': lb}, axis=1).apply(lambda x: 1 if pd.notnull(x['hb']) and pd.isnull(x['lb']) else -1 if pd.isnull(x['hb']) and pd.notnull(x['lb']) else np.NaN, axis=1).fillna(method='ffill').fillna(0).astype(int)
    zz = pd.concat({'hb': hb, 'lb': lb, 'dir': direction}, axis=1).apply(_find_zz, axis=1)
    group = direction.ne(direction.shift()).cumsum()
    zzdir = pd.concat({'zz': zz, 'dir': direction, 'group': group}, axis=1)
    zzdir['min'] = zzdir.groupby('group')['zz'].cummin().fillna(method='ffill')
    zzdir['max'] = zzdir.groupby('group')['zz'].cummax().fillna(method='ffill')
    zzdir['pp'] = np.NaN
    pp = Series(np.where(zzdir['dir'] == 1, zzdir['max'], np.where(zzdir['dir'] == -1, zzdir['min'], zzdir['pp'])))
    H = dataframe.rolling(minlength, min_periods=1)['high'].max()
    L = dataframe.rolling(minlength, min_periods=1)['low'].min()
    prevpp = np.NaN
    conscnt = 0
    condhigh = np.NaN
    condlow = np.NaN
    zones = DataFrame(index=dataframe.index, columns=['upper_bound', 'lower_bound'])
    indexes = []  # will keep indexes of candles that are part of the consolidation
    #----------------
    for index, value in pp.items():
        # pp is a value computed before: when it changes, it *may* be the end of a consolidation zone
        if value != prevpp:
            if conscnt > 0 and value <= condhigh and value >= condlow:
                # if condlow <= pp <= condhigh, we are still in consolidation
                conscnt = conscnt + 1
                indexes.append(index)
            else:  # end of consolidation
                conscnt = 0
                indexes = []
        else:
            conscnt = conscnt + 1
            indexes.append(index)
        if conscnt >= minlength:
            if conscnt == minlength:
                # initially, condhigh/low is equal to the highest/lowest value in last minlength candles
                condhigh = H.get(index)
                condlow = L.get(index)
            else:
                # update condhigh/low with new high/low
                condhigh = max(condhigh, dataframe.loc[index, 'high'])
                condlow = min(condlow, dataframe.loc[index, 'low'])
            zones.loc[zones.index.isin(indexes), 'upper_bound'] = condhigh
            zones.loc[zones.index.isin(indexes), 'lower_bound'] = condlow
        prevpp = value
    #----------------
    return zones
I don't know how to write the last part of the code (delimited by comments) without iterating over all the rows.
This is the original Pine Script: Consolidation Zones - Live - TradingView
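One way I'd sketch to speed this up (my suggestion, not part of the original post): keep the loop logic exactly as written, but run it over plain NumPy arrays so the per-row cost of .loc, .get and .items() disappears. It does not remove the loop, but in my experience most of the time in loops like this goes into pandas indexing. The sketch assumes dataframe has a unique RangeIndex (as the arithmetic on dataframe.index already implies) and is meant to replace everything from the zones = DataFrame(...) line to the end of the loop:
pp_arr = pp.to_numpy()
high_arr = dataframe['high'].to_numpy()
low_arr = dataframe['low'].to_numpy()
H_arr = H.to_numpy()
L_arr = L.to_numpy()
upper = np.full(len(dataframe), np.nan)
lower = np.full(len(dataframe), np.nan)

prevpp = np.nan
conscnt = 0
condhigh = np.nan
condlow = np.nan
indexes = []  # positional indexes of candles in the current consolidation
for i in range(len(pp_arr)):
    value = pp_arr[i]
    if value != prevpp:
        if conscnt > 0 and condlow <= value <= condhigh:
            conscnt += 1
            indexes.append(i)
        else:
            conscnt = 0
            indexes = []
    else:
        conscnt += 1
        indexes.append(i)
    if conscnt >= minlength:
        if conscnt == minlength:
            condhigh = H_arr[i]
            condlow = L_arr[i]
        else:
            condhigh = max(condhigh, high_arr[i])
            condlow = min(condlow, low_arr[i])
        upper[indexes] = condhigh  # fancy indexing replaces the isin() lookups
        lower[indexes] = condlow
    prevpp = value

zones = DataFrame({'upper_bound': upper, 'lower_bound': lower}, index=dataframe.index)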
I want to find the next value in 'y' relative to my val_test. In this example I want
next_up = 4
next_down = 6
(up and down refer to the coordinate system, not the numeric value)
val_test = 5
df = pd.DataFrame({'x': [1,1,1,2,2,3,3,3,4,4,4],
'y': [1,2,3,4,5,6,7,15,21,8,9]})
df=df.sort_values('y',ascending = True)
next_up = int(df.y.iloc[np.searchsorted(df.y.values,[val_test])])
df=df.sort_values('y', ascending = False)
next_down = int(df.y.iloc[np.searchsorted(df.y.values,[val_test])])
print('next_up = ', next_up)
print('next_down = ',next_down)
Perhaps you could stay only within Pandas.
There is no need to sort twice or use Numpy.
import pandas as pd
val_test = 5
col = 'y'
df = pd.DataFrame({
'x': [1,1,1,2,2,3,3,3,4,4,4],
'y': [1,2,3,4,5,6,7,15,21,8,9]})
# the default for sort_values is ascending=True
# you create a sorted dataframe in which the index corresponds to the ascending order
df_sorted = df.sort_values(col).reset_index(drop=True)
# you find the index in your sorted dataframe that corresponds to your val_test
idx_val_test = df_sorted.loc[df_sorted[col]==val_test].index[0]
# you select the next up value in this sorted dataframe
next_up = df_sorted[col].iloc[idx_val_test+1]
# you select the next down value in this sorted dataframe
next_down = df_sorted[col].iloc[idx_val_test-1]
print('next_up = ', next_up)
print('next_down = ', next_down)
I can suggest 2 cases, depending on what you need. For example:
CASE 1: You can validate the real position while keeping the boundaries in mind
df = pd.DataFrame({'x': [1,1,1,2,2,3,3,3,4,4,4],
                   'y': [1,2,3,4,5,6,7,15,21,8,9]})
val_test = 9
lista = df['y'].tolist()  # work on the raw (unsorted) list of y values
for i in range(len(lista)):
    if i == 0:
        if lista[i] == val_test:
            next_up = lista[i+1]
            next_down = 'First number'
    else:
        if lista[i] == val_test:
            try:
                next_up = lista[i+1]
            except IndexError:
                next_up = 'Last number'
            try:
                next_down = lista[i-1]
            except IndexError:
                next_down = 'First number'
            break
print('next_up = ', next_up)
print('next_down = ',next_down)
OUTPUT CASE #1
val_test = 9
next_up = Last number
next_down = 8
# but if you use
val_test = 5
next_up = 6
next_down = 4
Cool !!
CASE 2: You can have problems with the first and last index, although you can use
val_test = 15
df = pd.DataFrame({'x': [1,1,1,2,2,3,3,3,4,4,4],
'y': [1,2,3,4,5,6,7,15,21,8,9]})
next_up = int(df.y.iloc[np.searchsorted(df['y'].values,[val_test])+1])
next_down = int(df.y.iloc[np.searchsorted(df['y'].values,[val_test])-1])
print('next_up = ', next_up)
print('next_down = ',next_down)
OUTPUT CASE #2
val_test = 15
next_up = 21
next_down = 7
But you can have this problem when using val_test=9
val_test = 9
next_up = 21
next_down = 7
😓
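As a hedged aside of mine (not part of the original answer): np.searchsorted assumes its input is already sorted, which is why case 2 misbehaves for val_test = 9 on the unsorted 'y'. Sorting first, as the question's own code does, restores the expected neighbours, though boundary checks are still needed:
y_sorted = np.sort(df['y'].to_numpy())
pos = int(np.searchsorted(y_sorted, val_test))
next_up = int(y_sorted[pos + 1]) if pos + 1 < len(y_sorted) else 'Last number'
next_down = int(y_sorted[pos - 1]) if pos > 0 else 'First number'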
cols = Germandata.columns
percentage_list = [0.05,0.01,0.1]
for i in range(len(Germandata)):
    for percentage in percentage_list:
        columns_n = 3
        random_columns = np.random.choice(cols, columns_n, replace=False)
        local_data = Germandata.copy()
        remove_n = int(round(local_data.shape[0] * percentage, 0))
        for column_name in random_columns:
            drop_indices = np.random.choice(local_data.index, remove_n, replace=False)
            local_data.loc[drop_indices, column_name] = np.nan
The code here selects columns at random, deletes a certain percentage of observations from the data, and replaces them with NaNs. The problem is that after running the loop I only get the dataframe for the final percentage in the list, because it is overwritten on each iteration. How can I store the dataframe with NaNs after each iteration? Ideally I should get three dataframes, each with a different percentage of data deleted.
Try this
df_list = []
cols = Germandata.columns
percentage_list = [0.05,0.01,0.1]
for percentage in percentage_list:
    columns_n = 3
    random_columns = np.random.choice(cols, columns_n, replace=False)
    local_data = Germandata.copy()
    remove_n = int(round(local_data.shape[0] * percentage, 0))
    for column_name in random_columns:
        drop_indices = np.random.choice(local_data.index, remove_n, replace=False)
        local_data.loc[drop_indices, column_name] = np.nan
    local_data['percentage'] = percentage  # optional
    df_list.append(local_data)
df_05 = df_list[0]
df_01 = df_list[1]
df_1 = df_list[2]
Alternatively, you can use a dictionary
df_dict = {}
cols = Germandata.columns
percentage_list = [0.05,0.01,0.1]
for percentage in percentage_list:
    columns_n = 3
    random_columns = np.random.choice(cols, columns_n, replace=False)
    local_data = Germandata.copy()
    remove_n = int(round(local_data.shape[0] * percentage, 0))
    for column_name in random_columns:
        drop_indices = np.random.choice(local_data.index, remove_n, replace=False)
        local_data.loc[drop_indices, column_name] = np.nan
    local_data['percentage'] = percentage  # optional
    df_dict[str(percentage)] = local_data
df_05 = df_dict['0.05']
df_01 = df_dict['0.01']
df_1 = df_dict['0.1']
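As a hedged follow-up (my suggestion): if you later want all three variants in one frame, the dictionary concatenates directly, keyed by percentage on the outer index level:
combined = pd.concat(df_dict)
combined.loc['0.05']  # rows of the 5% variant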
I have just started to learn Python and don't have much of a dev background. Here is the code I have written while learning.
I now want to make a function which does exactly what my "for" loop is doing, but it needs to calculate a different exp column (exp, exp1, etc.) based on a different num column (num, num1, etc.).
How can I do this?
import pandas as pd
index = [0,1]
s = pd.Series(['a','b'],index= index)
t = pd.Series([1,2],index= index)
t1 = pd.Series([3,4],index= index)
df = pd.DataFrame(s,columns = ["str"])
df["num"] =t
df['num1']=t1
print (df)
exp=[]
for index, row in df.iterrows():
    if row['str'] == 'a':
        row['mul'] = -1 * row['num']
        exp.append(row['mul'])
    else:
        row['mul'] = 1 * row['num']
        exp.append(row['mul'])
df['exp'] = exp
print (df)
This is what I was trying to do, which gives wrong results:
import pandas as pd
index = [0,1]
s = pd.Series(['a','b'],index= index)
t = pd.Series([1,2],index= index)
t1 = pd.Series([3,4],index= index)
df = pd.DataFrame(s,columns = ["str"])
df["num"] =t
df['num1']=t1
def f(x):
    exp = []
    for index, row in df.iterrows():
        if row['str'] == 'a':
            row['mul'] = -1 * x
            exp.append(row['mul'])
        else:
            row['mul'] = 1 * x
            exp.append(row['mul'])
    return exp
df['exp'] = df['num'].apply(f)
df['exp1'] = df['num1'].apply(f)
df
Per suggestion below, I would do:
df['exp']=np.where(df.str=='a',df['num']*-1,df['num']*1)
df['exp1']=np.where(df.str=='a',df['num1']*-1,df['num1']*1)
I think you are looking for np.where
df['exp']=np.where(df.str=='a',df['num']*-1,df['num']*1)
df
Out[281]:
str num num1 exp
0 a 1 3 -1
1 b 2 4 2
Normal dataframe operation:
df["exp"] = df.apply(lambda x: x["num"] * (1 if x["str"]=="a" else -1), axis=1)
Mathematical dataframe operation:
df["exp"] = ((df["str"] == 'a')-0.5) * 2 * df["num"]
I have this code:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).toPandas()
Works fine, does what it needs to. Suppose though I only want to display the first n rows, and then call toPandas() to return a pandas dataframe. How do I do it? I can't call take(n) because that doesn't return a dataframe and thus I can't pass it to toPandas().
So to put it another way, how can I take the top n rows from a dataframe and call toPandas() on the resulting dataframe? This can't be difficult, but I can't figure it out.
I'm using Spark 1.6.0.
You can use the limit(n) function:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.limit(2).withColumn('age2', df.age + 2).toPandas()
Or:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).limit(2).toPandas()
You could get the first rows of the Spark DataFrame with head and then create a Pandas DataFrame from them:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df_pandas = pd.DataFrame(df.head(3), columns=df.columns)
In [4]: df_pandas
Out[4]:
name age
0 Alice 1
1 Jim 2
2 Sandra 3
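A hedged note of mine: df.take(n) also returns a plain list of Row objects, just like head(n), so the same pattern works if you prefer take:
df_pandas = pd.DataFrame(df.take(2), columns=df.columns)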
Try it:
def showDf(df, count=None, percent=None, maxColumns=0):
    if df is None:
        return
    import pandas
    from IPython.display import display
    pandas.set_option('display.encoding', 'UTF-8')
    # Pandas dataframe
    dfp = None
    # maxColumns param
    if maxColumns >= 0:
        if maxColumns == 0:
            maxColumns = len(df.columns)
        pandas.set_option('display.max_columns', maxColumns)
    # count param
    if count is None and percent is None:
        count = 10  # default count
    if count is not None:
        count = int(count)
        if count == 0:
            count = df.count()
        pandas.set_option('display.max_rows', count)
        dfp = pandas.DataFrame(df.head(count), columns=df.columns)
        display(dfp)
    # percent param
    elif percent is not None:
        percent = float(percent)
        if 0.0 <= percent <= 1.0:
            import datetime
            now = datetime.datetime.now()
            seed = int(now.strftime("%H%M%S"))  # long() was Python 2 only
            dfs = df.sample(False, percent, seed)
            count = df.count()
            pandas.set_option('display.max_rows', count)
            dfp = dfs.toPandas()
            display(dfp)
Examples of usages are:
# Shows the ten first rows of the Spark dataframe
showDf(df)
showDf(df, 10)
showDf(df, count=10)
# Shows a random sample which represents 15% of the Spark dataframe
showDf(df, percent=0.15)