class Dataframe: #Recommended to instatiate your dataframe with your csv name.
"""
otg_merge = Dataframe("/Users/zachary/Desktop/otg_merge.csv") #instaiate as a pandas dataframe
"""
def __init__(self, filepath, filename = None):
pd = __import__('pandas') #import pandas when the class is instatiated
self.filepath = filepath
self.filename = filename
def df(self): #it makes the DataFrame
df = pd.read_csv(self.filepath, encoding = "cp949", index_col= 0) #index col is not included
return df
def shape(self): #it returns the Dimension of DataFrame
shape = list(df.shape)
return shape
def head(self): #it reutrns the Head of Dataframe
primer = pd.DataFrame.head(df)
del primer["Unnamed: 0"]
return primer
def cust_types(self): #it returns the list of cust_type included in .csv
cust_type = []
for i in range(0, shape[0]):
if df.at[i, "cust_type"] not in cust_type: #if it's new..
cust_type.append(df.at[i, "cust_type"]) #append it as a new list element
return cust_type
I am doing some wrapping pandas functions wrapping for whom doesn't necessarily need to know the pandas.
If you see the code, at the third def, shape returns shape as a list of such as [11000, 134] as a xdim and ydim.
Now I'd like to use the shape again at the last def cust_types, however,, it returns the shape is not defined.
How can I share the variable "share" across defs in the same class?
intersetingly, I didn't do nth, but the df is shared from second df to thrid shape without error
First prepend "self." in all your attributes which you will know after trying out some python oops tutorials. Another issue which you might miss is
def df(self):
df = pd.read_csv(self.filepath, encoding = "cp949", index_col= 0)
return df
Here, the method name and the variable name takes the same name which is fine, if the variable name is not an instance attribute as it is not. But in case if you prepend "self." and make it as an instance attribute, your instance attribute will be self.df and it can't be a function after the first function call self.df().
Related
I'm sure I'm missing something in how classes work here, but basically this is my class:
import pandas as pd
import numpy as np
import scipy
#example DF with OHLC columns and 100 rows
gold = pd.DataFrame({'Open':[i for i in range(100)],'Close':[i for i in range(100)],'High':[i for i in range(100)],'Low':[i for i in range(100)]})
class Backtest:
def __init__(self, ticker, df):
self.ticker = ticker
self.df = df
self.levels = pivot_points(self.df)
def pivot_points(self,df,period=30):
highs = scipy.signal.argrelmax(df.High.values,order=period)
lows = scipy.signal.argrelmin(df.Low.values,order=period)
return list(df.High[highs[0]]) + list(df.Low[lows[0]])
inst = Backtest('gold',gold) #gold is a Pandas Dataframe with Open High Low Close columns and data
inst.levels # This give me the whole dataframe (inst.df) instead of the expected output of the pivot_point function (a list of integers)
The problem is inst.levels returns the whole DataFrame instead of the return value of the function pivot_points (which is supposed to be a list of integers)
When I called the pivot_points function on the same DataFrame outside this class I got the list I expected
I expected to get the result of the pivot_points() function after assigning it to self.levels inside the init but instead I got the entire DataFrame
You would have to address pivot_points() as self.pivot_points()
And there is no need to add period as an argument if you are not changing it, if you are, its okay there.
I'm not sure if this helps, but here are some tips about your class:
class Backtest:
def __init__(self, ticker, df):
self.ticker = ticker
self.df = df
# no need to define a instance variable here, you can access the method directly
# self.levels = pivot_points(self.df)
def pivot_points(self):
period = 30
# period is a local variable to pivot_points so I can access it directly
print(f'period inside Backtest.pivot_points: {period}')
# df is an instance variable and can be accessed in any method of Backtest after it is instantiated
print(f'self.df inside Backtest.pivot_points(): {self.df}')
# to get any values out of pivot_points we return some calcualtions
return 1 + 1
# if you do need an attribute like level to access it by inst.level you could create a property
#property
def level(self):
return self.pivot_points()
gold = 'some data'
inst = Backtest('gold', gold) # gold is a Pandas Dataframe with Open High Low Close columns and data
print(f'inst.pivot_points() outside the class: {inst.pivot_points()}')
print(f'inst.level outside the class: {inst.level}')
This would be the result:
period inside Backtest.pivot_points: 30
self.df inside Backtest.pivot_points(): some data
inst.pivot_points() outside the class: 2
period inside Backtest.pivot_points: 30
self.df inside Backtest.pivot_points(): some data
inst.level outside the class: 2
Thanks to the commenter Henry Ecker I found that I had the function by the same name defined elsewhere in the file where the output is the df. After changing that my original code is working as expected
This is a general python question. Is it possible to assign different variables to a class object and then perform different set of operations on those variables? I'm trying to reduce code but maybe this isn't how it works. For example, I'm trying to do something like this:
Edit: here is an abstract of the class and methods:
class Class:
def __init__(self, df):
self.df = df
def query(self, query):
self.df = self.df.query(query)
return self
def fill(self, filter):
self.df.update(df.filter(like=filter).mask(lambda x: x == 0).ffill(1))
return self
def diff(self, cols=None, axis=1):
diff = self.df[self.df.columns[~self.df.columns.isin(cols)]].diff(axis=axis)
self.df = diff.join(self.df[self.df.columns.difference(diff.columns)])
return self
def melt(self, cols, var=None, value=None):
return pd.melt(self.df, id_vars=columns, var_name=var, value_name=value)
I'm trying to use it like this:
df = pd.read_csv('data.csv')
df = Class(df)
df = df.query(query).forward_fill(include)
df_1 = df.diff(cols).melt(cols)
df_2 = df.melt(cols)
df_1 and df_2 should have different values, however they are the same as df_1. This issue is resolved if I use the class like this:
df_1 = pd.read_csv('data.csv')
df_2 = pd.read_csv('data.csv')
df_1 = Class(df_1)
df_2 = Class(df_2)
df_1 = df_1.query(query).forward_fill(include)
df_2 = df_2.query(query).forward_fill(include)
df_1 = df_1.diff(cols).melt(cols)
df_2 = df_2.melt(cols)
This results in extra code. Is there a better way to do this where you can use an object differently on different variables, or do I have to create seperate objects if I'm trying to have two variables perform separate operations and return different values?
With the return self statement in the diff- method you return the reference of the object. The same thing happens after the melt method. But in that two methods you allreadey manipulated the origin df.
Here:
1 df = pd.read_csv('data.csv')
2
3 df = Class(df)
4 df = df.query(query).forward_fill(include)
5
6 df_1 = df.diff(cols).melt(cols)
the df has the same values like df_1. I guess the melt method without other args then cols arguments only assigns col names or something like that. Subsequently df_2=df.melt(cols) would have the same result like df_2=df_1.melt(cols).
If you want to work with one object, you dont should use self.df=... in your class methods, because this changes the instance value of df. You only need to write df = ... and than return Class(df).
For example:
def diff(self, cols=None, axis=1):
diff = self.df[self.df.columns[~self.df.columns.isin(cols)]].diff(axis=axis)
df = diff.join(self.df[self.df.columns.difference(diff.columns)])
return Class(df)
Best regards
I am writing a class containing pandas functionalities. As an input I have a pandas dataframe but python seems to not recognizing it right.
import pandas as pd
class box:
def __init__(self, dataFrame, pers, limit):
self.df = dataFrame,
self.pers = pers,
self.data = limit
def cleanDataset(self):
persDf = self.df.filter(regex=('^' + self.pers + r'[1-9]$'))
persDF.replace({'-': None})
self.df.filter(...) gives me the warning: Instance of 'tuple' has no 'filter' member. I have found this but cannot apply the solution though since the problem is not caused by django.
Anyone who can help me out here?
Your problem is the comma at the end of self.df = dataFrame, (and self.pers = pers,). The comma isn't necessary here.
The comma makes the class think you're defining self.df as a tuple with one member. To check this, create a box object b and try print type(box.df). I'm guessing this will return <type 'tuple'>.
Remove the commas after the attribute definitions:
class box:
def __init__(self, dataFrame, pers, limit):
self.df = dataFrame
self.pers = pers
self.data = limit
I have the following class and the print statement returns an empty dataframe even though I'm sure my get_percent_change method is returning the values. I even tried just assigning test to three. Still, empty dataframe.
Is it something to do with the fact it's inside a class? Inside the init method? I tried using self.metrics too.
class options_metrics:
def __init__(self, calls, puts):
self.calls, self.puts = calls, puts
self.calls = self.calls.drop(["Type"])
self.puts = self.puts.drop(["Type"])
metrics = pd.DataFrame()
metrics['Perc_Chg_Vol_Call'], metrics['Perc_Chg_Open_Int_Call'] = self.get_percent_change(self.calls)
metrics['Test'] = 3
print(metrics)
input()
def get_percent_change(self, option_df):
perc_changes = option_df.pct_change(axis=1)
print(perc_changes)
return (perc_changes.ix['Vol',1], perc_changes.ix['Open_Int',1])
Here is the output:
Empty DataFrame
Columns: [Perc_Chg_Vol_Call, Perc_Chg_Open_Int_Call, Test]
Index: []
Switching the DataFrame to a Series worked.
First thing first, the title is very unclear, however nothing better sprang to my mind. I'll ellaborate the problem in more detail.
I've found myself doing this routine a lot with pandas dataframes. I need to work for a while with only the part(some columns) of the DataFrame and later I want to add those columns back. The an idea came to my mind = Context Managers. But I am unable to come up with the correct implementation (if there is any..).
import pandas as pd
import numpy as np
class ProtectColumns:
def __init__(self, df, protect_cols=[]):
self.protect_cols = protect_cols
# preserve a copy of the part we want to protect
self.protected_df = df[protect_cols].copy(deep=True)
# create self.df with only the part we want to work on
self.df = df[[x for x in df.columns if x not in protect_cols]]
def __enter__(self):
# return self, or maybe only self.df?
return self
def __exit__(self, *args, **kwargs):
# btw. do i need *args and **kwargs here?
# append the preserved data back to the original, now changed
self.df[self.protect_cols] = self.protected_df
if __name__ == '__main__':
# testing
# create random DataFrame
df = pd.DataFrame(np.random.randn(6,4), columns=list("ABCD"))
# uneccessary step
df = df.applymap(lambda x: int(100 * x))
# show it
print(df)
# work without cols A and B
with ProtectColumns(df, ["A", "B"]) as PC:
# make everything 0
PC.df = PC.df.applymap(lambda x: 0)
# this prints the expected output
print(PC.df)
However, say I don't want to use PC.df onwards, but df. I could just do df = PC.df, or make a copy inside with or after that. But is is possible to handle this inside e.g. the __exit__ method?
# unchanged df
print(df)
with ProtectColumns(df, list("AB")) as PC:
PC.applymap(somefunction)
# df is now changed
print(df)
Thanks for any ideas!