I have a function that I apply to the rows of a dataframe. The function returns a list whose length depends on a parameter.
For now I use the following example code:
import pandas as pd
df = pd.read_csv("data.csv")
def add_columns(x, amount):
    """Return ``amount`` consecutive integers, starting at 0, for row ``x``.

    The row itself is not inspected in this example; only ``amount``
    determines how many values come back.
    """
    new_values = range(amount)
    return new_values
# add_columns yields three values per row; zip(*...) transposes the per-row
# results so that each target column receives one value from every row.
df["L1"], df["L2"], df["L3"] = zip(*df.apply(lambda x: add_columns(x, 3), axis=1))
Is there a way to add the labels automatically ?
I'm not sure I understand correctly what you want to populate your columns with, but this should work:
import pandas as pd
import numpy as np
def add_columns(x, *args):
    """Return a Series that holds the value ``x`` under every new label.

    ``args[0]`` is the iterable of column labels; the returned Series is
    indexed by those labels, in the same order.
    """
    labels = args[0]
    return pd.Series(dict.fromkeys(labels, x))
def add_range(x, *args):
    """Return a Series mapping each new label to its position in ``range(amount)``.

    ``args[0]`` is the number of values to generate and ``args[1]`` is the
    iterable of column labels. Labels and values are paired positionally,
    so the shorter of the two decides how many entries the result has.
    The value ``x`` itself is not used.
    """
    amount, col_names = args[0], args[1]
    return pd.Series(dict(zip(col_names, range(amount))))
df = pd.DataFrame(np.random.uniform(size=(10, 2)), columns=["A", "B"])
labels = ["L1", "L2", "L3"]

# This populates with values from "A" column
# Extra positional arguments are passed as a tuple, hence ``(labels,)``.
# The merged frame is kept in a variable instead of being discarded.
df_from_a = df.merge(
    df["A"].apply(add_columns, args=(labels,)),
    left_index=True,
    right_index=True,
)

# This populates with values from range(number_passed) function
# The amount is derived from the labels so the two cannot drift apart.
df_from_range = df.merge(
    df["A"].apply(add_range, args=(len(labels), labels)),
    left_index=True,
    right_index=True,
)
Related
I have the following dataframe:
import pandas as pd
import numpy as np
from pandas_datareader import data as pdr
from datetime import date, timedelta
# NOTE(review): `yf` is not imported in this snippet; presumably it is the
# yfinance package (`import yfinance as yf`) — confirm before running.
yf.pdr_override()
end = date.today()
# 7300 days back from today, i.e. roughly 20 years of history.
start = end - timedelta(days=7300)
# download dataframe
data = pdr.get_data_yahoo('^GSPC', start=start, end= end)
Now that I have the dataframe, I want to create a function that adds the logarithmic return, based on a column, to the dataframe called 'data', using the following code:
# Natural log of each row's 'Adj Close' divided by the previous row's value.
data['log_return'] = np.log(data['Adj Close'] / data['Adj Close'].shift(1))
I think the function should look like this:
def add_log_return(df):
    # add returns in a logarithmic fashion
    # Work on a copy so the frame passed in is not modified.
    added = df.copy()
    # NOTE(review): `column` is neither a parameter nor defined anywhere in
    # this snippet, so the name cannot be resolved as written.
    added["log_return"] = np.log(df[column] / df[column].shift(1))
    # Scale the log return by 100, one element at a time.
    added["log_return"] = added["log_return"].apply(lambda x: x*100)
    return added
How can I select a specific column as an input of the function add_log_return(df['Adj Close']), so the function adds the logarithmic return to my 'data' dataframe?
# NOTE(review): this passes a single column to a function whose body indexes
# its argument with `df[column]`; the downloaded frame is also named `data`,
# not `df` — confirm the intended input.
data = add_log_return(df['Adj Close'])
Just add an argument column to your function!
def add_log_return(df, column):
    """Return a copy of ``df`` with a ``log_return`` column added.

    ``log_return`` is 100 times the natural log of each value in
    ``df[column]`` divided by the value in the previous row. The frame
    passed in is left unchanged.
    """
    prices = df[column]
    result = df.copy()
    result["log_return"] = np.log(prices / prices.shift(1)) * 100
    return result
# The label must match the frame's column exactly; the downloaded data uses
# 'Adj Close' (with a space), so 'Adj_Close' would raise a KeyError.
new_df = add_log_return(old_df, 'Adj Close')
Note I removed the line in your function to apply a lambda that just multiplied by 100. It's much faster to do this in a vectorized manner, by including it in the np.log(...) line
However, if I were you, I'd just return the Series object instead of copying the dataframe and modifying and returning the copy.
def log_return(col: pd.Series) -> pd.Series:
    """Return 100 times the log return of ``col`` relative to the previous row.

    The result is a Series with the same index as ``col``. Its first entry
    is NaN because the first row has no previous value to compare against.
    """
    return np.log(col / col.shift(1)) * 100
Now, the caller can do what they want with it:
# The downloaded data labels this column 'Adj Close' (with a space).
df['log_ret'] = log_return(df['Adj Close'])
I'm trying to write a function that will backfill columns in a dataframe adhering to a condition. The upfill (backfill) should only be done within groups. However, I am having a hard time getting the group object to ungroup. I have tried reset_index, as in the example below, but that raises an AttributeError.
Accessing the original df through result.obj doesn't lead to the updated value because there is no inplace for the groupby bfill.
def upfill(df:DataFrameGroupBy)->DataFrameGroupBy:
    # Walk over every column of the frame that was grouped (`df.obj`).
    for column in df.obj.columns:
        # Only columns whose name starts with "x" are to be back-filled.
        if column.startswith("x"):
            # NOTE(review): per the surrounding text, the group-by bfill has
            # no in-place mode, so this call does not update `df.obj`.
            df[column].bfill(axis="rows", inplace=True)
    return df
Assigning the dataframe column in the function doesn't work because a groupby object doesn't support item assignment.
def upfill(df:DataFrameGroupBy)->DataFrameGroupBy:
    # Walk over every column of the frame that was grouped (`df.obj`).
    for column in df.obj.columns:
        # Only columns whose name starts with "x" are to be back-filled.
        if column.startswith("x"):
            # NOTE(review): per the surrounding text, a group-by object does
            # not support item assignment, so this line fails.
            df[column] = df[column].bfill()
    return df
The test I'm trying to get to pass:
def test_upfill():
    # Groups: 1 -> row 0; 2 -> rows 1-2; 3 -> rows 3-4.
    df = DataFrame({
        "id":[1,2,3,4,5],
        "group":[1,2,2,3,3],
        "x_value": [4,4,None,None,5],
    })
    grouped_df = df.groupby("group")
    result = upfill(grouped_df)
    # NOTE(review): `upfill` is annotated as returning a group-by object; the
    # surrounding text reports an AttributeError for this call. Its return
    # value is also not kept.
    result.reset_index()
    # Row 2 stays empty (no later value inside group 2); row 3 takes the 5
    # that follows it inside group 3.
    assert result["x_value"].equals(Series([4,4,None,5,5]))
You should use 'transform' method on the grouped DataFrame, like this:
import pandas as pd
def test_upfill():
    """Check that back-filling per group never pulls values across groups."""
    frame = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "group": [1, 2, 2, 3, 3],
        "x_value": [4, 4, None, None, 5],
    })
    # Row 2 has no later value inside group 2, so it stays empty; row 3 is
    # filled from the 5 that follows it inside group 3.
    expected = pd.Series([4, 4, None, 5, 5])

    filled = frame.groupby("group").transform(lambda part: part.bfill())

    assert filled["x_value"].equals(expected)


test_upfill()
Here you can find more information about the transform method on GroupBy objects.
Based on the accepted answer, this is the full solution I arrived at, although I have read elsewhere that there are issues with using the obj attribute.
def upfill(df: DataFrameGroupBy) -> DataFrameGroupBy:
    """Back-fill, within each group, every column whose name starts with "x".

    The filled values are written into ``df.obj``, the frame the group-by
    was created from, so that frame is modified in place. The same group-by
    object is returned.

    Columns with non-string labels are skipped. If no column qualifies,
    the frame is left untouched.
    """
    columns = [
        column
        for column in df.obj.columns
        if isinstance(column, str) and column.startswith("x")
    ]
    if not columns:
        # Nothing to fill; avoid selecting an empty column list on the group-by.
        return df
    # The group-by bfill only looks at later rows of the same group and
    # returns a frame aligned with the original index.
    df.obj[columns] = df[columns].bfill()
    return df
def test_upfill():
    """upfill back-fills "x" columns per group and updates the source frame."""
    df = DataFrame({
        "id": [1, 2, 3, 4, 5],
        "group": [1, 2, 2, 3, 3],
        "x_value": [4, 4, None, None, 5],
    })
    # Row 2 has no later value inside group 2, so it stays empty; row 3 is
    # filled from the 5 that follows it inside group 3.
    expected = Series([4, 4, None, 5, 5])

    # The return value is not needed: the check below reads the frame the
    # group-by was built from.
    upfill(df.groupby("group"))

    assert df["x_value"].equals(expected)
Trying to compute a value for each row of dataframe in parallel using the following code, but getting errors either when I pass individual input ranges or the combination:
#!pip install pyblaze
import itertools
import pyblaze
import pyblaze.multiprocessing as xmp
import pandas as pd
inputs = [range(2), range(2), range(3)]
# Every (a, b, c) combination of the three input ranges.
inputs_list = list(itertools.product(*inputs))
# Level names are given as a list: a set has no defined order, so the names
# could end up attached to the wrong levels.
Index = pd.MultiIndex.from_tuples(inputs_list, names=["a", "b", "c"])
df = pd.DataFrame(index=Index)
df['Output'] = 0
print(df)
def Addition(A, B, C):
    """Store ``A + B + C`` in the 'Output' cell of the row keyed by ``(A, B, C)``.

    Writes to the module-level ``df`` and returns it.
    """
    # A single .loc call with (row key, column). The chained form
    # `df.loc[A, B, C]['Output'] = ...` assigns into a temporary object and
    # can leave `df` unchanged.
    df.loc[(A, B, C), 'Output'] = A + B + C
    return df
def parallel(inputs_list):
    """Process ``inputs_list`` through a pyblaze ``Vectorizer`` wrapping ``Addition``.

    The vectorizer is created with 8 workers and the value returned by its
    ``process`` call is handed back unchanged.

    NOTE(review): how the vectorizer passes each item to ``Addition``
    (one tuple vs. three separate arguments) is not visible here — confirm.
    """
    vectorizer = xmp.Vectorizer(Addition, num_workers=8)
    results = vectorizer.process(inputs_list)
    return results


parallel(inputs_list)
I know how to apply an IF condition in Pandas DataFrame. link
However, my question is how to do the following:
# NOTE(review): `df[df['col1'] == 0]` is the filtered frame rather than a
# single True/False value, and `sys` is not imported in this snippet.
if (df[df['col1'] == 0]):
    # Make the folder that contains self_module importable.
    sys.path.append("/desktop/folder/")
    import self_module as sm
    df = sm.call_function(df)
What I really want to do is when value in col1 equals to 0 then call function call_function().
def call_function(ds):
    """Add a ``new_age`` column to ``ds`` and return the same frame.

    ``new_age`` is ``age`` multiplied by 0.012345678901 and rounded to
    12 decimal places. ``ds`` is modified in place.
    """
    scaled_age = ds['age'] * 0.012345678901
    ds['new_age'] = scaled_age.round(12)
    return ds
I provide a simple example above for call_function().
Since your function interacts with multiple columns and returns a whole data frame, run conditional logic inside the method:
def call_function(ds):
    """Add ``new_age`` to ``ds``, filled only where ``col1`` equals 0.

    For rows with ``col1 == 0``, ``new_age`` is ``age`` multiplied by
    0.012345678901 and rounded to 12 decimal places; every other row gets
    NaN. ``ds`` is modified in place and returned.
    """
    # The question's condition column is 'col1'.
    is_zero = ds['col1'] == 0
    ds['new_age'] = np.nan
    ds.loc[is_zero, 'new_age'] = ds.loc[is_zero, 'age'].mul(0.012345678901).round(12)
    return ds
# call_function adds the column to the frame it receives and returns that
# same frame.
df = call_function(df)
If you are unable to modify the function, run the method on splits of the data frame and concat or append them together. Any new columns will have their values filled with NAs in the other split.
def call_function(ds):
    """Add ``new_age`` (``age`` * 0.012345678901, rounded to 12 places) to ``ds``.

    ``ds`` is modified in place and returned.
    """
    ds['new_age'] = (ds['age'] * 0.012345678901).round(12)
    return ds


# Evaluate the condition once and split on it; the question's column is 'col1'.
is_zero = df['col1'] == 0
# Only the matching split goes through call_function; the other split has no
# 'new_age' column. The matching rows come first in the result, so the
# original row order is not preserved (index labels are).
df = pd.concat([call_function(df[is_zero].copy()),
                df[~is_zero].copy()])
I am trying to replicate a simple Technical-Analysis indicator using xlwings. However, the list/data seems not to be able to read Excel values. Below is the code
import pandas as pd
import datetime as dt
import numpy as np
#xw.func
def EMA(df, n):
    """Return ``df`` joined with an exponential moving average of ``df['Close']``.

    The average uses ``span=n`` and ``min_periods=n - 1`` and is added as a
    column named ``'EMA_<n>'``. ``df`` must be a DataFrame with a 'Close'
    column. A new frame is returned and the one passed in is not modified.
    """
    # pd.ewma() was removed from pandas; Series.ewm(...).mean() replaces it.
    # The local name also no longer shadows the function itself.
    ema = df['Close'].ewm(span=n, min_periods=n - 1).mean().rename('EMA_' + str(n))
    return df.join(ema)
When I enter a list of Excel data, =EMA({1,2,3,4,5}, 5), I get the following error message:
TypeError: list indices must be integers, not str EMA = pd.Series(pd.ewma(df['Close'], span = n, min_periods = n - 1), name = 'EMA_' + str(n))
(Expert) help much appreciated! Thanks.
EMA() expects a DataFrame df and a scalar n, and it returns the EMA in a separate column in the source DataFrame. You are passing a simple list of values, this is not supposed to work.
Construct a DataFrame and assign the values to the Close column:
v = range(100)  # placeholder data: substitute your own list of values
# Put the values into a one-column frame whose column is named 'Close'.
df = pd.DataFrame({'Close': v})
Call EMA() with this DataFrame:
# EMA returns the joined frame rather than changing `df`; assign the return
# value if the new column should be kept.
EMA(df, 5)