writing function in pandas/python - python

I have just started to learn python and don't have much of dev background. Here is the code I have written while learning.
I now want to write a function that does exactly what my "for" loop does, but it needs to calculate different exp columns (exp, exp1, etc.) based on different num columns (num, num1, etc.).
how can I do this?
import pandas as pd
index = [0,1]
s = pd.Series(['a','b'],index= index)
t = pd.Series([1,2],index= index)
t1 = pd.Series([3,4],index= index)
df = pd.DataFrame(s,columns = ["str"])
df["num"] =t
print (df)
for index, row in df.iterrows():
if(row['str'] == 'a'):
row['mul'] = -1 * row['num']
row['mul'] = 1 * row['num']
df['exp'] = exp
print (df)
This is what i was trying to do which gives wrong results
import pandas as pd
index = [0,1]
s = pd.Series(['a','b'],index= index)
t = pd.Series([1,2],index= index)
t1 = pd.Series([3,4],index= index)
df = pd.DataFrame(s,columns = ["str"])
df["num"] =t
def f(x):
for index, row in df.iterrows():
if(row['str'] == 'a'):
row['mul'] = -1 * x
row['mul'] = 1 * x
return exp
df['exp'] = df['num'].apply(f)
df['exp1'] = df['num1'].apply(f)
Per suggestion below, I would do:

I think you are looking for np.where
str num num1 exp
0 a 1 3 -1
1 b 2 4 2

Normal dataframe operation:
# Row-wise sign flip: rows whose "str" is "a" get -num, every other row keeps
# +num, which is what the question's loop and its expected output ask for.
df["exp"] = df.apply(lambda row: row["num"] * (-1 if row["str"] == "a" else 1), axis=1)
Mathematical dataframe operation:
# (0.5 - mask) * 2 maps True -> -1.0 and False -> +1.0, so "a" rows are negated
# and all other rows are left with their original sign.
df["exp"] = (0.5 - (df["str"] == 'a')) * 2 * df["num"]


How can I optimize this Python Pandas code?

The following function takes market data (OHLCV candles) and tries to find periods where the price oscillates between two bounds (consolidation zones). I wrote it by translating, from Pine Script to Python, an open-source indicator found on TradingView.
The function works, in the sense that it correctly finds consolidation zones. The problem is performance, mainly due to the for loop at the end: with 30K candles it takes just ~5 seconds to execute the code before the for loop, and then over 2 minutes to run the loop itself.
import numpy as np
import pandas as pd
from pandas import (DataFrame, Series)
def _find_zz(row: Series):
    """Return the pivot value for one row holding 'hb', 'lb' and 'dir'.

    When both 'hb' and 'lb' are present, 'dir' breaks the tie: 1 selects
    'hb', anything else selects 'lb'. When only one of them is present that
    one is returned, and NaN is returned when both are missing.
    """
    has_high = pd.notnull(row['hb'])
    has_low = pd.notnull(row['lb'])
    if has_high and has_low:
        return row['hb'] if row['dir'] == 1 else row['lb']
    if has_high:
        return row['hb']
    if has_low:
        return row['lb']
    # np.nan, not the np.NaN alias: the alias was removed in NumPy 2.0.
    return np.nan
def consolidation_zones(dataframe: DataFrame, timeperiod: int = 100,
minlength: int = 20) -> DataFrame:
# Look for consolidation zones (runs of candles whose price stays between an
# upper and a lower bound) and return a frame indexed like `dataframe` with
# 'upper_bound' / 'lower_bound' columns.
# `timeperiod` is the rolling window used for the pivot search; `minlength` is
# the window for H / L and the run length at which bounds start being tracked.
# NOTE(review): the leading indentation of this listing is not preserved, so
# the nesting of the loop's branches cannot be read from the text - confirm
# against the original before relying on it.
# NOTE(review): np.NaN was removed in NumPy 2.0 (np.nan is the surviving
# spelling) and fillna(method=...) is deprecated since pandas 2.1 - revisit
# both when upgrading.
# Rolling window of up to `timeperiod` rows; min_periods=1 lets early rows use
# a shorter window.
rolling = dataframe.rolling(timeperiod, min_periods=1)
# Index label of the highest high / lowest low inside each window.
# NOTE(review): .astype(int) presumes an integer index - confirm.
idxmax = rolling['high'].apply(lambda x: x.idxmax()).astype(int)
idxmin = rolling['low'].apply(lambda x: x.idxmin()).astype(int)
# offset == 0 means the current row is itself the window extreme.
highest = pd.concat({'value': dataframe['high'], 'offset': dataframe.index - idxmax}, axis=1)
lowest = pd.concat({'value': dataframe['low'], 'offset': dataframe.index - idxmin}, axis=1)
# hb / lb: the row's high / low when it is the window extreme, NaN otherwise.
hb = highest.apply(lambda x: x['value'] if x['offset'] == 0 else np.NaN, axis=1)
lb = lowest.apply(lambda x: x['value'] if x['offset'] == 0 else np.NaN, axis=1)
# direction: 1 when only hb is set, -1 when only lb is set; any other row
# carries the previous value forward, and rows before the first pivot get 0.
direction = pd.concat({'hb': hb, 'lb': lb}, axis=1).apply(lambda x: 1 if pd.notnull(x['hb']) and pd.isnull(x['lb']) else -1 if pd.isnull(x['hb']) and pd.notnull(x['lb']) else np.NaN, axis=1).fillna(method='ffill').fillna(0).astype(int)
# zz: one pivot value per row, chosen by _find_zz from hb / lb / dir.
zz = pd.concat({'hb': hb, 'lb': lb, 'dir': direction}, axis=1).apply(_find_zz, axis=1)
# group: counter that increases each time direction differs from the previous row.
group = direction.ne(direction.shift()).cumsum()
zzdir = pd.concat({'zz': zz, 'dir': direction, 'group': group}, axis=1)
# Running min / max of zz inside each direction run, forward-filled over gaps.
zzdir['min'] = zzdir.groupby('group')['zz'].cummin().fillna(method='ffill')
zzdir['max'] = zzdir.groupby('group')['zz'].cummax().fillna(method='ffill')
zzdir['pp'] = np.NaN
# pp: the running max while dir == 1, the running min while dir == -1, NaN otherwise.
# It is built from a bare array, so it gets a fresh 0..n-1 index; the loop
# below uses those values as labels into dataframe, H and L.
pp = Series(np.where(zzdir['dir'] == 1, zzdir['max'], np.where(zzdir['dir'] == -1, zzdir['min'], zzdir['pp'])))
# Highest high / lowest low over the last `minlength` rows.
H = dataframe.rolling(minlength, min_periods=1)['high'].max()
L = dataframe.rolling(minlength, min_periods=1)['low'].min()
# Loop state: previous pp value, length of the current run, zone bounds so far.
prevpp = np.NaN
conscnt = 0
condhigh = np.NaN
condlow = np.NaN
zones = DataFrame(index=dataframe.index, columns=['upper_bound', 'lower_bound'])
indexes = [] # will keep indexes of candles that are part of the consolidation
for index, value in pp.items():
# pp is a value computed before: when it changes, it *may* be the end of a consolidation zone
# NaN never compares equal to itself, so a NaN pp always counts as "changed".
if value != prevpp:
if conscnt > 0 and value <= condhigh and value >= condlow:
# if condlow <= pp <= condhigh, we are still in consolidation
conscnt = conscnt + 1
else: # end of consolidation
conscnt = 0
indexes = []
conscnt = conscnt + 1
if conscnt >= minlength:
if conscnt == minlength:
# initially, condhigh/low is equal to the highest/lowest value in last minlength candles
condhigh = H.get(index)
condlow = L.get(index)
# update condhigh/low with new high/low
condhigh = max(condhigh, dataframe.loc[index, 'high'])
condlow = min(condlow, dataframe.loc[index, 'low'])
# NOTE(review): nothing visible in this listing ever appends to `indexes`, so
# these isin() masks would select no rows - check whether an append was lost.
zones.loc[zones.index.isin(indexes), 'upper_bound'] = condhigh
zones.loc[zones.index.isin(indexes), 'lower_bound'] = condlow
prevpp = value
return zones
I don't know how to write the last part of the code (delimited by comments) without iterating over all the rows.
This is the original Pine Script: Consolidation Zones - Live - TradingView

Search through a dataframe using Regex in a for loop to pull out a value associated with the Regex

I have a subset dataframe from a much larger dataframe. I need to be able to create a for loop that searches through a dataframe and pull out the data corresponding to the correct name.
import pandas as pd
import numpy as np
import re
data = {'Name': ['CH_1', 'CH_2', 'CH_3', 'FV_1', 'FV_2', 'FV_3'],
'Value': [1, 2, 3, 4, 5, 6]
df = pd.DataFrame(data)
FL = [17.7, 60.0]
CH = [20, 81.4]
tol = 8
time1 = FL[0] + tol
time2 = FL[1] + tol
time3 = CH[0] + tol
time4 = CH[1] + tol
FH_mon = df['Values'] *5
workpercent = [.7, .92, .94]
mhpy = [2087, 2503, 3128.75]
list1 = list()
list2 = list()
for x in df['Name']:
if x == [(re.search('FV_', s)) for s in df['Name'].values]:
y = np.select([FH_mon < time1 , (FH_mon >= time1) and (FH_mon < time2), FH_mon > time2], [workpercent[0],workpercent[1],workpercent[2]])
z = np.select([FH_mon < time1 , (FH_mon >= time1) and (FH_mon < time2), FH_mon > time2], [mhpy[0],mhpy[1],mhpy[2]])
if x == [(re.search('CH_', s)) for s in df['Name'].values]:
y = np.select([FH_mon < time3, (FH_mon >= time3) and (FH_mon < time4)], [workpercent[0],workpercent[1]])
z = np.select([FH_mon < time3, (FH_mon >= time3) and (FH_mon < time4)], [mhpy[0],mhpy[1]])
I had a simple version earlier where I was just adding a couple of numbers, and I got really helpful answers to the way I asked my question, but here is the more complex version. I need to search through the dataframe, and any time there is an FV in the Name column, the if block should run and use the data from the rows whose Name contains FV. Same for CH. I have the lists to keep track of each value as the loop goes through the Name column. If there is a simpler way I would really appreciate seeing it; right now this seems like the cleanest way, but I am receiving errors or the loop does not function properly.
This should be what you want:
# Add a per-prefix increment to "Value": +2 when the name contains "FV_",
# +4 when it contains "CH_" (and not "FV_"); other rows are left alone.
for idx, name in df["Name"].items():
    if re.search("FV_", name) is not None:
        increment = 2
    elif re.search("CH_", name) is not None:
        increment = 4
    else:
        continue
    df.loc[idx, "Value"] += increment
If the "Name" column only has values starting with "FV_" or "CH_", use where:
df["Value"] = df["Value"].add(2).where(df["Name"].str.startswith("FV_"), df["Value"].add(4))
If you might have other values in "Name", use numpy.select:
import numpy as np
# default=df["Value"] keeps rows that match neither prefix unchanged; without
# it np.select would overwrite them with its default of 0.
df["Value"] = np.select(
    [df["Name"].str.startswith("FV_"), df["Name"].str.startswith("CH_")],
    [df["Value"].add(2), df["Value"].add(4)],
    default=df["Value"],
)
>>> df
Name Value
0 CH_1 5
1 CH_2 6
2 CH_3 7
3 FV_1 6
4 FV_2 7
5 FV_3 8

Making permanent change in a dataframe using python pandas

I would like to convert my dataframe from one format of values (X:XX:XX:XX) to another (X.X seconds).
Here is what my dataframe looks like:
Start End
0 0:00:00:00
1 0:00:00:00 0:07:37:80
2 0:08:08:56 0:08:10:08
3 0:08:13:40
4 0:08:14:00 0:08:14:84
And I would like to transform it in seconds, something like that
Start End
0 0.0
1 0.0 457.80
2 488.56 490.08
3 493.40
4 494.0 494.84
To do that I did:
i = 0
j = 0
while j < 10:
while i < 10:
if data.iloc[i, j] != "":
Value = (int(data.iloc[i, j][0]) * 3600) + (int(data.iloc[i, j][2:4]) *60) + int(data.iloc[i, j][5:7]) + (int(data.iloc[i, j][8: 10])/100)
NewValue = data.iloc[:, j].replace([data.iloc[i, j]], Value)
i += 1
NewValue = data.iloc[:, j].replace([data.iloc[i, j]], "")
i += 1
i = 0
j += 1
But I failed to replace the old values with the new ones in my original dataframe in a permanent way; when I do:
I still get my old dataframe in the wrong format.
Could someone help me? I tried so hard!
Thank you so so much!
You are using pandas.DataFrame.update that requires a pandas dataframe as an argument. See the Example part of the update function documentation to really understand what update does https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.update.html
If I may suggest a more idiomatic solution; you can directly map a function to all values of a pandas Series
def parse_timestring(s):
    """Convert an ``H:MM:SS:CC`` string to a number of seconds.

    The four colon-separated fields are hours, minutes, seconds and
    centiseconds. Empty strings and non-string values (such as the NaN/None
    pandas uses for blank cells) are returned unchanged, so the function can
    be mapped over a column that still contains gaps.

    Raises ValueError if a field is not an integer.
    """
    if not isinstance(s, str) or s == "":
        return s
    # The last field is centiseconds (1/100 s), not milliseconds.
    weights = (3600, 60, 1, 0.01)
    fields = [int(part) for part in s.split(":")]
    return sum(value * weight for value, weight in zip(fields, weights))
df["Start"] = df["Start"].map(parse_timestring)
You can remove the if ... else ... from parse_timestring if you replace all empty string with nan values in your dataframe with df = df.replace("", numpy.nan) then use df["Start"] = df["Start"].map(parse_timestring, na_action='ignore')
see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html
The datetime library is made to deal with such data. You should also use the apply function of pandas to avoid iterating over the dataframe like that.
You should proceed as follows:
from datetime import datetime, timedelta
def to_seconds(date):
    """Convert an ``H:MM:SS:CC`` timestamp to seconds as a float.

    The fields are hours, minutes, seconds and centiseconds, matching the
    data in the question (e.g. "0:07:37:80" -> 457.8). Empty strings and
    non-string values (blank cells) are returned unchanged.

    Raises ValueError if the string does not have exactly four integer fields.
    """
    if not isinstance(date, str) or date == "":
        return date
    hours, minutes, seconds, centis = (int(part) for part in date.split(':'))
    # timedelta has no centisecond unit; 1 cs == 10 ms.
    delta = timedelta(hours=hours, minutes=minutes, seconds=seconds,
                      milliseconds=10 * centis)
    return delta.total_seconds()
data['Start'] = data['Start'].apply(to_seconds)
data['End'] = data['End'].apply(to_seconds)
Thank you so much for your help.
Your method was working. I also found a method using loop:
To summarize, my general problem was that I had an ugly csv file that I wanted to transform into a csv usable for doing statistics, and I wanted to use python to do that.
my csv file was like:
MiceID = 1 Beginning End Type of behavior
0 0:00:00:00 Video start
1 0:00:01:36 grooming type 1
2 0:00:03:18 grooming type 2
3 0:00:06:73 0:00:08:16 grooming type 1
So in my ugly csv file I was writing only the moment of the beginning of each behavior type, without the end, when the different types of behavior directly followed each other, and I was writing the moment of the end of the behavior only when the mouse stopped grooming altogether; that allowed me to separate sequences of grooming. But this type of csv was not easily usable for statistics.
So I wanted to 1) transform all my values into seconds to have a correct format, 2) fill the gaps in the End column (a gap has to be filled with the following Beginning value, as the end of a specific behavior in a sequence is the beginning of the following one), 3) create columns corresponding to the duration of each behavior, and finally 4) fill these new columns with the durations.
My question was about the first step, but I put the code for each step separately here:
step 1: transform the values in a good format
import pandas as pd
import numpy as np
data = pd.read_csv("D:/Python/TestPythonTraitementDonnéesExcel/RawDataBatch2et3.csv", engine = "python")
data.replace(np.nan, "", inplace = True)
i = 0
j = 0
while j < len(data.columns):
while i < len(data.index):
if (":" in data.iloc[i, j]) == True:
Value = str((int(data.iloc[i, j][0]) * 3600) + (int(data.iloc[i, j][2:4]) *60) + int(data.iloc[i, j][5:7]) + (int(data.iloc[i, j][8: 10])/100))
data = data.replace([data.iloc[i, j]], Value)
i += 1
i += 1
i = 0
j += 1
step 2: fill the gaps
i = 0
j = 2
while j < len(data.columns):
while i < len(data.index) - 1:
if data.iloc[i, j] == "":
data.iloc[i, j] = data.iloc[i + 1, j - 1]
i += 1
elif np.all(data.iloc[i:len(data.index), j] == ""):
i += 1
i = 0
j += 4
step 3: create a new column for each mouse:
j = 1
k = 0
while k < len(data.columns) - 1:
k = (j * 4) + (j - 1)
data.insert(k, "Duree{}".format(k), "")
j += 1
step 4: fill the new duration columns
j = 4
i = 0
while j < len(data.columns):
while i < len(data.index):
if data.iloc[i, j - 2] != "":
data.iloc[i, j] = str(float(data.iloc[i, j - 2]) - float(data.iloc[i, j - 3]))
i += 1
i = 0
j += 5
And of course, export my new usable dataframe
data.to_csv(r"D:/Python/TestPythonTraitementDonnéesExcel/FichierPropre.csv", index = False, header = True)
here are the transformations:
click on the links for the pictures
before step1
after step 1
after step 2
after step 3
after step 4

Take n rows from a spark dataframe and pass to toPandas()

I have this code:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).toPandas()
Works fine, does what it needs to. Suppose though I only want to display the first n rows, and then call toPandas() to return a pandas dataframe. How do I do it? I can't call take(n) because that doesn't return a dataframe and thus I can't pass it to toPandas().
So to put it another way, how can I take the top n rows from a dataframe and call toPandas() on the resulting dataframe? Can't think this is difficult but I can't figure it out.
I'm using Spark 1.6.0.
You can use the limit(n) function:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.limit(2).withColumn('age2', df.age + 2).toPandas()
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).limit(2).toPandas()
You could get first rows of Spark DataFrame with head and then create Pandas DataFrame:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df_pandas = pd.DataFrame(df.head(3), columns=df.columns)
In [4]: df_pandas
name age
0 Alice 1
1 Jim 2
2 Sandra 3
Try it:
def showDf(df, count=None, percent=None, maxColumns=0):
    """Display the first rows, or a random sample, of a Spark DataFrame via pandas.

    df         -- Spark DataFrame to show; nothing happens when it is None.
    count      -- number of leading rows to show; 0 means every row. When
                  both count and percent are None, count defaults to 10.
    percent    -- fraction in [0.0, 1.0] of rows to sample; only used when
                  count is None. Values outside that range show nothing.
    maxColumns -- pandas 'display.max_columns' value; 0 means all columns of
                  df, a negative value leaves the option untouched.

    Note that the pandas display options are changed globally.
    """
    if df is None:
        return
    import pandas
    from IPython.display import display
    pandas.set_option('display.encoding', 'UTF-8')
    # Pandas dataframe
    dfp = None
    # maxColumns param
    if maxColumns >= 0:
        if maxColumns == 0:
            maxColumns = len(df.columns)
        pandas.set_option('display.max_columns', maxColumns)
    # count param
    if count is None and percent is None:
        count = 10  # Default count
    if count is not None:
        count = int(count)
        if count == 0:
            count = df.count()
        pandas.set_option('display.max_rows', count)
        dfp = pandas.DataFrame(df.head(count), columns=df.columns)
    # percent param
    elif percent is not None:
        percent = float(percent)
        if 0.0 <= percent <= 1.0:
            import datetime
            now = datetime.datetime.now()
            # int() rather than long(): long does not exist on Python 3.
            seed = int(now.strftime("%H%M%S"))
            dfs = df.sample(False, percent, seed)
            count = df.count()
            pandas.set_option('display.max_rows', count)
            dfp = dfs.toPandas()
    # Render the result; without this call the function produced no output.
    if dfp is not None:
        display(dfp)
Examples of usages are:
# Shows the ten first rows of the Spark dataframe
showDf(df, 10)
showDf(df, count=10)
# Shows a random sample which represents 15% of the Spark dataframe
showDf(df, percent=0.15)

Pythonic way to simplify multiply 2 pandas columns with condition

I am looking for a way how I can simplify below examples:
self.df[TARGET_NAME] = self.df.apply(lambda row: 1 if row['WINNER'] == 1 and row['WINNER_OVER_2_5'] == 1 else 0, axis=1)
self.df[TARGET_NAME] = self.df[(self.df.WINNER == 1)] &
self.df[(self.df.WINNER_OVER_2_5 == 1)] # not it's not correct
and more complex as below
df["PROFIT"] = np.where((df[TARGET_NAME] == df["PREDICTED"]) & (df["PREDICTED"] == 0),
df['MATCH_HOME'] * df['HOME_STAKE'],
np.where((dfml[TARGET_NAME] == df["PREDICTED"]) & (df["PREDICTED"] == 1),
df['MATCH_DRAW'] * df['DRAW_STAKE'],
np.where((df[TARGET_NAME] == df["PREDICTED"]) & (df["PREDICTED"] == 2),
df['MATCH_AWAY'] * df['AWAY_STAKE'],
IIUC you can use isin:
print df
0 1 0
1 1 1
2 0 2
df['TARGET_NAME'] = np.where((df.WINNER.isin([1]) & df.WINNER_OVER_2_5.isin([1])),1,0)
print df
0 1 0 0
1 1 1 1
2 0 2 0
EDIT (untested, because no data):
df["PROFIT"] = np.where((df[TARGET_NAME] == df["PREDICTED"]) & (df["PREDICTED"].isin([0])),
df['MATCH_HOME'] * df['HOME_STAKE'],
np.where((dfml[TARGET_NAME] == df["PREDICTED"]) & (df["PREDICTED"].isin([1])),
df['MATCH_DRAW'] * df['DRAW_STAKE'],
np.where((df[TARGET_NAME] == df["PREDICTED"]) & (df["PREDICTED"].isin([2])),
df['MATCH_AWAY'] * df['AWAY_STAKE'],
I am looking for a way how I can simplify below examples:
I guess you're looking for a simpler syntax. How about this:
df['MATCH'] = matches(df, values=(0,1), WINNER=1, WINNER_OVER_2_5=1)
Note that values= is optional and takes any tuple(false-value, true-value), defaulting to (False, True).
To get there it takes a bit of magic. Essentially this builds a truth table by chaining the conditions and transforming the result into the values as specified. It ends up doing the same thing as your lambda, just in a generic way.
def matches(df, values=None, **kwargs):
    """Return a Series flagging the rows where every column == value condition holds.

    df      -- DataFrame to test.
    values  -- optional (false_value, true_value) pair used for the result;
               defaults to (False, True).
    kwargs  -- column_name=value conditions, combined with logical AND.

    Raises ValueError when no condition is given.
    """
    values = values or (False, True)
    if not kwargs:
        raise ValueError("matches() needs at least one column=value condition")
    mask = None
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    for column, wanted in kwargs.items():
        hit = df[column] == wanted
        mask = hit if mask is None else (mask & hit)
    return mask.apply(lambda flag: values[1] if flag else values[0])
maybe you can try with boolean attribute
df= pd.DataFrame({'a':[1,0,1,0],'b' :[1,1,0,np.nan]})
df['NEW']= ((df['a']==1 ) & (df['b']==1)).astype(int).fillna(0)

