Python assign different variables to a class object - python

This is a general python question. Is it possible to assign different variables to a class object and then perform different set of operations on those variables? I'm trying to reduce code but maybe this isn't how it works. For example, I'm trying to do something like this:
Edit: here is an abstract of the class and methods:
class Class:
    """Chainable wrapper around a pandas DataFrame.

    ``query``, ``fill`` and ``diff`` replace or modify ``self.df`` and return
    ``self`` so that calls can be chained. Because they hand back the same
    instance, every variable bound to it observes the modified frame.
    ``melt`` returns a plain DataFrame and therefore ends a chain.
    """

    def __init__(self, df):
        self.df = df

    def query(self, query):
        """Keep only the rows selected by the ``DataFrame.query`` expression."""
        self.df = self.df.query(query)
        return self

    def fill(self, filter):
        """Forward-fill zeros row-wise in columns whose label contains ``filter``.

        Zeros in the selected columns are masked to NaN, filled from the
        column to their left, and the result is written back into ``self.df``.
        """
        # Read from the wrapped frame; a bare ``df`` is not defined in here.
        selected = self.df.filter(like=filter)
        filled = selected.mask(lambda x: x == 0).ffill(axis=1)
        self.df.update(filled)
        return self

    # The usage examples call this method as ``forward_fill``.
    forward_fill = fill

    def diff(self, cols=None, axis=1):
        """Difference every column except ``cols`` along ``axis``.

        The columns named in ``cols`` are left as they are and joined back
        after the differenced columns. ``cols=None`` differences everything.
        """
        # ``Index.isin(None)`` raises TypeError, so treat None as "exclude nothing".
        excluded = [] if cols is None else cols
        to_diff = self.df.columns[~self.df.columns.isin(excluded)]
        diff = self.df[to_diff].diff(axis=axis)
        self.df = diff.join(self.df[self.df.columns.difference(diff.columns)])
        return self

    def melt(self, cols, var=None, value=None):
        """Return ``self.df`` unpivoted with ``cols`` as identifier columns.

        ``var`` and ``value`` name the variable and value columns; when left
        as None, pandas' defaults ("variable" and "value") are used.
        """
        # Passing value_name=None through would label the value column ``None``.
        value_name = "value" if value is None else value
        return pd.melt(self.df, id_vars=cols, var_name=var, value_name=value_name)
I'm trying to use it like this:
df = pd.read_csv('data.csv')
df = Class(df)
# query() returns the wrapper itself (and the fill step presumably does too),
# so ``df`` stays bound to one single Class instance.
df = df.query(query).forward_fill(include)
# diff() rebinds that instance's ``self.df`` to the differenced frame before
# melt() runs ...
df_1 = df.diff(cols).melt(cols)
# ... so this melt() sees the already-differenced frame, which is why df_2
# ends up equal to df_1.
df_2 = df.melt(cols)
df_1 and df_2 should have different values, however they are the same as df_1. This issue is resolved if I use the class like this:
# Workaround: two independent Class instances, each wrapping its own freshly
# read frame, so diff() on df_1 cannot affect what df_2 melts.
df_1 = pd.read_csv('data.csv')
df_2 = pd.read_csv('data.csv')
df_1 = Class(df_1)
df_2 = Class(df_2)
df_1 = df_1.query(query).forward_fill(include)
df_2 = df_2.query(query).forward_fill(include)
df_1 = df_1.diff(cols).melt(cols)
df_2 = df_2.melt(cols)
This results in extra code. Is there a better way to do this where you can use an object differently on different variables, or do I have to create separate objects if I'm trying to have two variables perform separate operations and return different values?

With the return self statement in the diff method you return a reference to the object itself. The same thing happens after the melt method. But in those two methods you have already manipulated the original df.
Here:
1 df = pd.read_csv('data.csv')
2
3 df = Class(df)
4 df = df.query(query).forward_fill(include)
5
6 df_1 = df.diff(cols).melt(cols)
the df has the same values as df_1. I guess the melt method, when called with no arguments other than cols, only assigns column names or something like that. Subsequently df_2=df.melt(cols) would have the same result as df_2=df_1.melt(cols).
If you want to work with one object, you should not use self.df=... in your class methods, because this changes the instance's value of df. You only need to write df = ... and then return Class(df).
For example:
def diff(self, cols=None, axis=1):
    """Return a new ``Class`` with every column except ``cols`` differenced.

    ``self.df`` is only read, never rebound, so the instance can be reused
    for further, independent chains. ``cols=None`` differences every column.
    """
    # ``Index.isin(None)`` raises TypeError, so treat None as "exclude nothing".
    excluded = [] if cols is None else cols
    diff = self.df[self.df.columns[~self.df.columns.isin(excluded)]].diff(axis=axis)
    df = diff.join(self.df[self.df.columns.difference(diff.columns)])
    return Class(df)
Best regards

Related

Ungroup pandas dataframe after bfill

I'm trying to write a function that will backfill columns in a dataframe adhering to a condition. The upfill should only be done within groups. I am however having a hard time getting the group object to ungroup. I have tried reset_index as in the example below, but that raises an AttributeError.
Accessing the original df through result.obj doesn't lead to the updated value because there is no inplace for the groupby bfill.
# First attempt: back-fill each column whose name starts with "x" in place on
# the groupby object and return that object.
# NOTE: per the surrounding discussion this does not update the data, because
# the groupby bfill has no in-place mode.
def upfill(df:DataFrameGroupBy)->DataFrameGroupBy:
for column in df.obj.columns:
if column.startswith("x"):
df[column].bfill(axis="rows", inplace=True)
return df
Assigning the dataframe column in the function doesn't work because a groupby object doesn't support item assignment.
# Second attempt: assign the back-filled column onto the groupby object.
# NOTE: this fails, because a groupby object does not support item assignment.
def upfill(df:DataFrameGroupBy)->DataFrameGroupBy:
for column in df.obj.columns:
if column.startswith("x"):
df[column] = df[column].bfill()
return df
The test I'm trying to get to pass:
# Target behaviour: values are back-filled only within each "group", so the
# missing value in row 2 (group 2) stays missing while row 3 (group 3) gets 5.
# NOTE: as written, reset_index() is called on the groupby object returned by
# upfill, which is the call the question reports as raising AttributeError.
def test_upfill():
df = DataFrame({
"id":[1,2,3,4,5],
"group":[1,2,2,3,3],
"x_value": [4,4,None,None,5],
})
grouped_df = df.groupby("group")
result = upfill(grouped_df)
result.reset_index()
assert result["x_value"].equals(Series([4,4,None,5,5]))
You should use 'transform' method on the grouped DataFrame, like this:
import pandas as pd
def test_upfill():
    """Check that back-filling via ``transform`` stays inside each group."""
    df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "group": [1, 2, 2, 3, 3],
        "x_value": [4, 4, None, None, 5],
    })
    # transform() returns a frame aligned with df's index, so no separate
    # "ungroup" step is needed afterwards.
    result = df.groupby("group").transform(lambda x: x.bfill())
    # Row 2 stays empty: the next value (5) belongs to a different group.
    assert result["x_value"].equals(pd.Series([4, 4, None, 5, 5]))
test_upfill()
Here you can find more information about the transform method on Groupby objects
Based on the accepted answer this is the full solution I got to although I have read elsewhere there are issues using the obj attribute.
def upfill(df: DataFrameGroupBy) -> DataFrameGroupBy:
    """Back-fill every column whose name starts with "x", within each group.

    The filled values are written into ``df.obj`` (the frame the groupby was
    built from); the same groupby object is returned.
    """
    columns = [
        column
        for column in df.obj.columns
        # Non-string labels (e.g. integers) have no ``startswith``.
        if isinstance(column, str) and column.startswith("x")
    ]
    # Nothing to fill: skip the transform/assignment on an empty selection.
    if columns:
        df.obj[columns] = df[columns].transform(lambda x: x.bfill())
    return df
def test_upfill():
    """Check that ``upfill`` back-fills "x" columns per group in the source frame."""
    df = DataFrame({
        "id": [1, 2, 3, 4, 5],
        "group": [1, 2, 2, 3, 3],
        "x_value": [4, 4, None, None, 5],
    })
    grouped_df = df.groupby("group")
    # upfill writes into the frame behind the groupby, so the result is
    # checked on ``df`` itself and the returned groupby object is not needed.
    upfill(grouped_df)
    assert df["x_value"].equals(Series([4, 4, None, 5, 5]))

Update a global data frame through a class method

I would like to append rows in a data frame which is specified as the first argument of the following class and according to a name, which is the second argument, when I instantiate an instance.
My problem is that I would like to update the state of the data frame which is passed as the first argument, and when I inspect it afterwards I don't get the updated state.
import pandas as pd
class RecordClass(object):
    """Accumulate rows for one ``name`` in a DataFrame held on the instance."""

    def __init__(self, df, name):
        self.name = name
        self.df = df

    def write_method(self, *args):
        """Add one row built from the first three ``args`` and return the frame.

        The row is added by building a NEW DataFrame and rebinding ``self.df``
        to it; the frame that was passed to ``__init__`` is never modified.

        Raises:
            IndexError: if fewer than three positional arguments are given.
        """
        keys = ['key1', 'key2', 'key3']
        dictionary = {'name': self.name}
        for idx, key in enumerate(keys):
            dictionary[key] = args[idx]
        # DataFrame.append was removed in pandas 2.0. Like append, concat
        # returns a new object instead of changing its inputs.
        new_row = pd.DataFrame([dictionary])
        self.df = pd.concat([self.df, new_row], ignore_index=True)
        return self.df
df1 = pd.DataFrame()
data = [1, 2, 3]
instance1 = RecordClass(df1, 'a')
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(instance1.write_method(*data))
print("")
# df1 is printed to show that it is still the empty frame it started as.
print(df1)
The result I get is:
key1 key2 key3 name
0 1.0 2.0 3.0 a
Empty DataFrame
Columns: []
Index: []
which means that df1 is not updated. How can I update df1 after calling the write_method method, without the assignment df1 = instance1.write_method(...)?
You can't. append is not an inplace operation, and when you reassign self.df with the result of the append, you are simply creating a new object, completely different from the original, and you are assigning that object to self.df. The original object that self.df (and df1) pointed to is not changed.
If this is important functionality, I can recommend creating a DataFrame with NaN entries, and filling them using loc or [:] assignment.

pandas: fill a column by applying a class method to another column (which contains classes)

i have a pandas dataframe where one of the column is filled with class objects, like the code below:
import pandas as pd
class rec:
    """Example object whose ``test`` method returns its argument unchanged."""

    def test(self, a):
        return a
class rec1:
    """Example object whose ``test`` method returns its argument times three."""

    def test(self, a):
        return a * 3
x = rec()
y = rec1()
# Named ``objects`` rather than ``list`` so the builtin is not shadowed.
objects = [x, y]
# Column 'first' holds the instances, column 'second' the input for ``test``.
df = pd.DataFrame(objects, columns=['first'])
df['second'] = ['a1', 'b1']
print(df)
first second
0 <__main__.rec object at 0x000000180AAE9208> a1
1 <__main__.rec1 object at 0x000000180AACBEB8> b1
now, i wish to create a new column by applying the method "test" to column 'first', by reading input for "test" from column 'second'.
this loop works:
# Start with an object column: the results are whatever ``test`` returns
# (strings here), which an integer column initialised with 0 cannot hold.
df['third'] = None
for i in df.index:
    # A single .loc assignment writes into df itself; the chained form
    # df['third'][i] = ... may write to a temporary copy instead.
    df.loc[i, 'third'] = df.loc[i, 'first'].test(df.loc[i, 'second'])
but i wonder if i can avoid the loop and use something more similar to the following code (which does not work):
df['third'] = df['first'].test(df['second'])
any advice? thank you
This isn't that hard to do actually. You can use np.vectorize.
f = lambda x, y: x.test(y)
v = np.vectorize(f)
df['third'] = v(df['first'], df['second'])
df
first second third
0 <__main__.rec object at 0x1038b1ef0> a1 a1
1 <__main__.rec1 object at 0x1038b1c18> b1 b1b1b1

Changing self.variables inside __exit__ method of Context Managers

First things first: the title is very unclear, but nothing better sprang to mind. I'll elaborate on the problem in more detail.
I've found myself doing this routine a lot with pandas dataframes. I need to work for a while with only a part (some columns) of the DataFrame, and later I want to add those columns back. Then an idea came to my mind: context managers. But I am unable to come up with the correct implementation (if there is any).
import pandas as pd
import numpy as np
class ProtectColumns:
    """Context manager that sets some columns aside while the rest are edited.

    Inside the ``with`` block the caller works on ``self.df``, which holds
    only the unprotected columns. On exit the preserved columns are attached
    to whatever ``self.df`` refers to at that point, so they end up after the
    working columns. The frame passed to the constructor is not modified.
    """

    def __init__(self, df, protect_cols=None):
        # A ``[]`` default would be one list object shared by every instance.
        self.protect_cols = [] if protect_cols is None else protect_cols
        # preserve a copy of the part we want to protect
        self.protected_df = df[self.protect_cols].copy(deep=True)
        # Work on an independent copy of the remaining columns, so that
        # re-attaching columns in __exit__ never targets a selection of the
        # caller's frame.
        working_cols = [x for x in df.columns if x not in self.protect_cols]
        self.df = df[working_cols].copy()

    def __enter__(self):
        # Return the manager itself so the block can rebind ``.df`` and
        # __exit__ still sees the new frame.
        return self

    def __exit__(self, *args, **kwargs):
        # __exit__ is called with three positional arguments (exception type,
        # value, traceback), which ``*args`` accepts. Returning None lets any
        # exception raised inside the block propagate.
        # append the preserved data back to the (possibly replaced) frame
        self.df[self.protect_cols] = self.protected_df
if __name__ == '__main__':
# testing
# create random DataFrame
df = pd.DataFrame(np.random.randn(6,4), columns=list("ABCD"))
# uneccessary step
df = df.applymap(lambda x: int(100 * x))
# show it
print(df)
# work without cols A and B
with ProtectColumns(df, ["A", "B"]) as PC:
# make everything 0
PC.df = PC.df.applymap(lambda x: 0)
# this prints the expected output
print(PC.df)
However, say I don't want to use PC.df onwards, but df. I could just do df = PC.df, or make a copy inside the with block or after it. But is it possible to handle this inside e.g. the __exit__ method?
# unchanged df
print(df)
with ProtectColumns(df, list("AB")) as PC:
PC.applymap(somefunction)
# df is now changed
print(df)
Thanks for any ideas!

How can I add a variable amount of columns in Pandas?

I have a function which I apply it on the rows of a dataframe. This function returns a list of variable length depending on a parameter.
For now I use the following example code:
import pandas as pd
df = pd.read_csv("data.csv")
def add_columns(x, amount):
    """Return ``range(amount)``; the row ``x`` is accepted but not used."""
    return range(amount)
df["L1"], df["L2"], df["L3"] = zip(*df.apply(lambda x: add_columns(x, 3), axis=1))
Is there a way to add the labels automatically ?
Not sure if I understand your question correctly in what you want to populate your columns with but this should work:
import pandas as pd
import numpy as np
def add_columns(x, *args):
    """Return a Series holding ``x`` under every label in ``args[0]``."""
    col_names = args[0]
    return pd.Series({i: x for i in col_names})
def add_range(x, *args):
    """Return a Series mapping the labels in ``args[1]`` to ``range(args[0])``.

    ``x`` is accepted but not used. Labels and numbers are paired with
    ``zip``, so pairing stops at the shorter of the two.
    """
    amount, col_names = args[0], args[1]
    return pd.Series(dict(zip(col_names, range(amount))))
df = pd.DataFrame(np.random.uniform(size=(10,2)),columns=["A","B"])
labels = ["L1","L2","L3"]
# This populates with values from "A" column
df.merge(df["A"].apply(add_columns, args=([labels])),left_index=True, right_index=True)
# This populates with values from range(number_passed) function
df.merge(df["A"].apply(add_range, args=([3,labels])),left_index=True, right_index=True)

Categories

Resources