Pythonic way to declare multiple empty dataframes in a class? - python

I have a class like below. I am wondering what is the most pythonic way to declare and initialize multiple empty dataframes?
import pandas as pd


class ReadData:
    """Read three input CSVs (file1.csv .. file3.csv) from input_dir.

    The three frames start empty and are populated by read_inputs().
    """

    def __init__(self, input_dir):
        # Empty placeholders until read_inputs() is called.
        self.df1 = pd.DataFrame(data=None)
        self.df2 = pd.DataFrame(data=None)
        self.df3 = pd.DataFrame(data=None)
        self.input_dir = input_dir

    def read_inputs(self):
        # BUG FIX: the original `def read_inputs():` omitted `self`, so
        # calling it on an instance raised TypeError.
        self.df1 = pd.read_csv(self.input_dir + "/file1.csv")
        self.df2 = pd.read_csv(self.input_dir + "/file2.csv")
        self.df3 = pd.read_csv(self.input_dir + "/file3.csv")


if __name__ == "__main__":
    # Guarded so importing this module does not try to read files.
    ReadData("./").read_inputs()

In general, dataframes are not supposed to be initialized empty and appended to (appending to dataframes is a slow memory intensive operation). You'll be better off storing your data in structures that can append data quickly such as a list.
However, to answer your question, you can use a dictionary comprehension and keep your dataframes in a dictionary. Or you can do the same with a list.
import pandas as pd


class Data:
    """Hold three empty DataFrames, keyed "df0".."df2", in one dict."""

    def __init__(self):
        # One empty frame per generated key.
        self.dfs = {f"df{i}": pd.DataFrame(data=None) for i in range(3)}
Then you can access your data like so:
data = Data()
data.dfs["df1"]
Though the power of using a dictionary is that you can explicitly name your data. So a structure like this may be more intuitive:
class Data:
    """Hold one empty DataFrame per caller-supplied name."""

    def __init__(self, df_names):
        # Build the mapping explicitly: one empty frame per name.
        self.dfs = {}
        for name in df_names:
            self.dfs[name] = pd.DataFrame(data=None)
data = Data(df_names=["df1", "better_named_df", "averages"])
# accessing underlying frames
data.dfs["df1"]
data.dfs["better_named_df"]
Another approach using a list-comprehension instead of a dictionary:
import pandas as pd


class Data:
    """Hold three empty DataFrames in a positional list."""

    def __init__(self):
        # Distinct frames (not `[pd.DataFrame()] * 3`, which would alias).
        frames = []
        for _ in range(3):
            frames.append(pd.DataFrame(data=None))
        self.dfs = frames
data = Data()
data.dfs[0]
data.dfs[1]
Since you specified that you're just reading in these dataframes to run different queries against them, I wouldn't recommend a class at all. This is because there is no common functionality that you're going to run against each dataframe, aside from reading them into memory. A function that returns a dictionary should suffice:
import pathlib
import pandas as pd


def read_data(base_dir, file_names):
    """Read each CSV under base_dir; return {file stem: DataFrame}."""
    root = pathlib.Path(base_dir)
    # Key each frame by its file name without extension (Path.stem).
    return {(root / fname).stem: pd.read_csv(root / fname)
            for fname in file_names}
# you can call this function like so:
dfs = read_data("./", ["file1.csv", "file2.csv", "file3.csv"])
# dfs is a dictionary with this structure:
# {"file1": dataframe from file1.csv,
# "file2": dataframe from file2.csv,
# "file3": dataframe from file3.csv}
# access data like this
dfs["file1"]

If you are intent on having each DataFrame be an attribute you can take advantage of setattr.
class Data:
    """Expose n empty DataFrames as attributes df1 .. dfn via setattr."""

    def __init__(self, n):
        for num in range(1, n + 1):
            attr_name = "df" + str(num)
            setattr(self, attr_name, pd.DataFrame())
Then whatever number you supply to the constructor, you would have that many DataFrame attributes on the object.

Related

How to remove lines with empty elements within a lists converted from a pandas data frame using python?

So I try to convert a pandas data frame to my customized class function and here is the code for it:
import os
import pandas as pd
import math

# Locate all .XLSX workbooks in the current directory and load them
# into a single DataFrame.
cwd = os.path.abspath('')
files = os.listdir(cwd)
# FIX: DataFrame.append was deprecated and removed in pandas 2.0, and
# appending inside a loop is quadratic — collect the frames first, then
# concatenate once.
frames = [pd.read_excel(name) for name in files if name.endswith('.XLSX')]
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
#print(df)
array = df.values.tolist()
print(array)
class Item():
    """Record of one item: name, cost, gender and prime flag."""

    def __init__(self, name, cost, gender, prime):
        # Double-underscore names are mangled to _Item__name, etc.
        self.__name = name
        self.__cost = cost
        self.__gender = gender
        self.__prime = prime

    def __repr__(self):
        fields = (self.__name, self.__cost, self.__gender, self.__prime)
        return "Item({},{},{},{})".format(*fields)
# FIX: NaN is truthy, so `k[0] and k[1] and ...` never filters out rows
# whose cells are NaN (the question's reported bug).  Reject NaN (and
# other empty/falsy placeholders) explicitly instead.
def _is_present(value):
    """True when `value` is a usable cell: truthy and not a float NaN."""
    return bool(value) and not (isinstance(value, float) and math.isnan(value))

mylist = [Item(*k) for k in array if all(_is_present(v) for v in k[:4])]
#print(mylist)
However, there are missing elements in the data frame, so when converting it to the list using array = df.values.tolist() instead of being an "None" for the empty part, the result would produce "nan" instead. This, in fact will cause the filtering process in "mylist" not working.
So, can you show me the code to use instead? Thank you in advance.
Much easier to do while it's still a pandas DataFrame. If you insert a
df.dropna(inplace=True)
before you df.values.tolist() then any rows with missing values should be removed.
There are two ways
Use filter
import math

...
array = df.values.tolist()
# FIX: the original filter(...) had misplaced parentheses — isinstance
# received three arguments and map received no iterable, so it raised
# TypeError as soon as the filter was consumed.  Intended behaviour:
# keep a row only when none of its cells is NaN.
array = [row for row in array
         if not any(isinstance(cell, float) and math.isnan(cell)
                    for cell in row)]
...
Use pandas
...
df = df.dropna()
array = df.values.tolist()
...

How to use Pandas to work on a data loaded from my own created class?

I got a small problems on working with Pandas. The problem is I created a file that stores class to read and clean data from a .csv file. and I import my own library to load the data and then i want to use the pandas dataframe for other operations. But for some reason, I can't do it.
So, here is the code I created a class for loading/reading the file:
import pandas as pd


class Load_Data:
    """Load and clean a CSV: drop the Remarks column and NaN rows."""

    def __init__(self, filename):
        # Name-mangled to _Load_Data__filename, as in the original.
        self.__filename = filename

    def load(self):
        """Return the cleaned DataFrame read from the stored filename."""
        frame = pd.read_csv(self.__filename)
        del frame["Remarks"]
        return frame.dropna()
and in another file, i was trying to import this self-created library for data processing step and then try to work on it with Pandas DataFrame.
from Load_Data import Load_Data
import pandas as pd
test_df = Load_Data("Final_file.csv")
test_df.load()
There is no problem printing the table of the content from my file. But when I tried to use it (test_df) as a Pandas dataframe, for example, I want to GroupBy some of the attributes
test_df.groupby(['width','length'])
it ends up showing:
'Load_Data' object has no attribute 'groupby'
which means if i want to use the groupby function, i have to write it on my own in my own class. but I don't want to do that. I just want to convert my class to a Pandas DataFrame and work using their package directly for some complex operations.
I would really appreciate any kind help.
You are using class as if it was a function. Push return statement inside load method
import pandas as pd


class Load_Data:
    """CSV loader: load() returns the frame minus Remarks and NaN rows."""

    def __init__(self, filename):
        self.__filename = filename

    def load(self):
        # Read, drop the unwanted column, then discard incomplete rows;
        # the return value is a DataFrame, not a Load_Data instance.
        cleaned = (pd.read_csv(self.__filename)
                   .drop(columns=["Remarks"])
                   .dropna())
        return cleaned
Usage:
test_df = Load_Data("Final_file.csv").load() #this change
# or
load_data = Load_Data("Final_file.csv")
test_df = load_data.load()
load returns a DataFrame and not a Load_Data instance.
Can you share the next line or two which throw an error?
Are you referencing the returned data, or the class?
I.e.
df2= test_df.load()
df2.groupby()
Or
test_df.groupby()
Are you trying to create a new data frame class build on pandas?
If so you'd need something like this (might work)
class LoadDF(pd.DataFrame):
    """DataFrame subclass that can populate itself from a CSV file.

    NOTE(review): subclassing DataFrame is rarely a good idea — prefer a
    plain function or composition.  Kept here to match the question.
    """

    # Tell pandas to carry this attribute through internal copies.
    _metadata = ["_filename"]

    def __init__(self, filename):
        # FIX: the original class header was missing its ":" and never
        # called the DataFrame constructor.  Start as an empty frame.
        super().__init__()
        self._filename = filename

    def load(self):
        # FIX: `self = df` only rebinds a local name and has no effect.
        # Instead, re-run the DataFrame constructor in place with the
        # cleaned data, then return self for convenience.
        df = pd.read_csv(self._filename)
        del df["Remarks"]
        df = df.dropna()
        super().__init__(df)
        return self

Trouble using classes to call an instance on a dataframe object

Newbie at dealing with classes.
I have some dataframe objects I want to transform, but I'm having trouble manipulating them with classes. Below is an example. The goal is to transpose a dataframe and reassign it to its original variable name. In this case, the dataframe is assets.
import pandas as pd
from requests import get
import numpy as np

# Scrape the CBN assets page; read_html returns a list of frames and
# index [2] selects the table of interest (first two rows skipped).
html = get("https://www.cbn.gov.ng/rates/Assets.asp").text
table = pd.read_html(html,skiprows=[0,1])[2]
# Keep only rows 1..12 — note the slice starts at label/position 1, so
# index label 0 is absent from `assets`.
assets = table[1:13]
class Array_Df_Retitle:
    # Wraps a DataFrame in order to transpose it and promote one of its
    # rows/columns to the column labels.
    def __init__(self,df):
        # Store the frame to transform.
        self.df = df
    def change(self):
        # Transpose, then use the transposed column labelled 0 (i.e. the
        # original row labelled 0) as the new column names.
        self.df = self.df.transpose()
        # NOTE(review): this is the question's bug — after transposing,
        # self.df[0] looks up a COLUMN labelled 0, which only exists if
        # the original index contained label 0.  `assets` above is
        # sliced [1:13], so label 0 is absent and this raises KeyError: 0.
        self.df.columns = self.df[0]
        return self.df
However, calling assets = Array_Df_Retitle(assets).change() simply yields an error:
KeyError: 0
I'd like to know where I'm getting things wrong.
I made a few changes to your code. The problem is coming from self.df[0]. This means you are selecting the column named 0. However, after transposing, you will not have any column named 0. You will have a row instead.
import pandas as pd
from requests import get
import numpy as np

# Same scrape as in the question: fetch the page, parse the third table,
# and keep rows 1..12.
html = get("https://www.cbn.gov.ng/rates/Assets.asp").text
table = pd.read_html(html,skiprows=[0,1])[2]
assets = table[1:13]
class Array_Df_Retitle:
    """Transpose a frame and promote its first transposed row to headers."""

    def __init__(self, df):
        self.df = df

    def change(self):
        # Drop all-empty rows before transposing, then take ROW 0 of the
        # transposed frame (original column 0) as the header values.
        transposed = self.df.dropna(how='all').transpose()
        transposed.columns = transposed.loc[0, :]
        self.df = transposed
        # Remove the header row itself and renumber the index.
        return transposed.drop(0).reset_index(drop=True)
Array_Df_Retitle(assets).change()

create a list of pandas data frame variable names with similar spelling

In my environment I have a list of several pandas data frames that are similarly named.
For example:
import pandas as pd
import numpy as np

# FIX: each of the original lines was missing its closing ")".
# NOTE(review): `dates` must already be defined in the session, e.g.
# dates = pd.date_range("2017-01-01", periods=6) — TODO confirm.
df_abc = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df_xyz = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df_2017 = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
... potentially others
I'd like to create list that automatically figures which data frames are in my environment and pulls them into a list dynamically.
list_of_dfs = ['df_abc','df_xyz','df_2017', 'df_anything else']
# except done dynamically. In this example anything beginning with 'df_'
# list_of_dfs = function_help(begins with 'df_')
globals() should return a dictionary of variable_name:variable_value for the global variables.
If you want a list of defined variables with names starting with 'df_' you could do:
# A dict iterates over its keys, so calling .keys() is unnecessary.
list_of_dfs = [variable for variable in globals()
               if variable.startswith('df_')]
I reckon there has to be a better way than storing your dataframes globally, and relying on globals() to fetch their variable names though. Maybe store them all inside a dictionary?:
# Store the frames in a dict keyed by explicit names instead of relying
# on global variable names.
dataframes = {}
for key in ("df_1", "df_2"):
    dataframes[key] = pd.DataFrame()
list_of_dfs = dataframes.keys()

Passing a python dataframe to an object and altering the dataframe

I am new to python and I am trying to pass an argument (dataframe) to a function and change value of the argument (dataframe) by reading an excel file.
(Assume that I have imported all the necessary files)
I have noticed that python does not pass the argument by reference here and I end up not having the dataframe initialized/changed.
I read that python passes by object-reference and not by value or reference. However, I do not need to change the same dataframe.
The output is: <class 'pandas.core.frame.DataFrame'>
import pandas as pd
from pandas import DataFrame as df


class Data:
    # Class-level placeholder: an *instance* of DataFrame (df()), not
    # the class object itself as in the original.
    x = df()

    @staticmethod  # FIX: the original "#staticmethod" was a typo for "@"
    def import_File(df_name, file):
        """Load Sheet1 of `file` into Data.x.

        FIX: rebinding the parameter (`df_name = ...`) never reaches the
        caller, so assign to the class attribute instead.  `df_name` is
        kept only so existing callers keep working.
        NOTE(review): `sheetname` was renamed `sheet_name` in pandas
        0.21+; this matches the question's pandas version.
        """
        Data.x = pd.io.excel.read_excel(file.replace('"', ''),
                                        sheetname='Sheet1')


def inputdata():
    # Demo entry point: load the workbook and show the resulting frame.
    Data.import_File(Data.x, r"C:\Users\Data\try.xlsx")
    print(Data.x)
You seem to be doing a lot of things the hard way. I'll try to simplify it while conforming to standard patterns of use.
# Whatever imports you need
import pandas as pd
# Static variables and methods should generally be avoided.
# Change class and variable names to whatever is more suitable.
# Names should be meaningful when possible.
class MyData:
    """Eagerly loads Sheet1 of the given Excel workbook into .data."""

    def __init__(self, filename):
        # Loading happens in the constructor; pd.read_excel is the
        # public alias of pd.io.excel.read_excel.
        self.data = pd.read_excel(filename, sheetname='Sheet1')
def inputData():
    """Demo: build a MyData from a fixed path and print its frame."""
    # Forward slashes work fine on Windows.
    holder = MyData('C:/Users/Data/try.xlsx')
    print(holder.data)
Here's the version where it loads in a method rather than the constructor.
import pandas as pd


class MyData:
    """Excel-file holder that loads on demand via loadFile()."""

    def __init__(self):
        # No data yet; `loaded` flips to True once loadFile() succeeds.
        self.data = None
        self.loaded = False

    def loadFile(self, filename, sheetname='Sheet1'):
        # FIX: the original `def` line was missing its trailing ":".
        # NOTE(review): `sheetname` became `sheet_name` in pandas 0.21+.
        self.data = pd.read_excel(filename, sheetname=sheetname)
        self.loaded = True
def inputData():
    """Demo: load two workbooks into two independent MyData objects."""
    first = MyData()
    first.loadFile('C:/Users/Data/try.xlsx')
    print(first.data)
    # Load other data, using sheet 'Sheet2' instead of the default.
    second = MyData()
    second.loadFile('C:/Users/Data/tryagain.xlsx', 'Sheet2')
    # Keyword arguments may also be given by name, in any order:
    # second.loadFile(sheetname='Sheet2', filename='C:/Users/Data/tryagain.xlsx')
    print(second.data)
    # first and second coexist with different data; calling loadFile()
    # again with another path overwrites an object's data.
The reason why it doesn't save in your original code is because assigning values to argument names never changes the original variable in Python. What you can do is something like this:
# Continuing from the last code block
def loadDefault(data):
    """Load the default workbook into `data` (mutates the argument)."""
    data.loadFile('C:/Users/Data/try.xlsx')
def testReference():
    """Show that mutating an argument is visible to the caller."""
    holder = MyData()
    loadDefault(holder)
    # holder.data has now been populated by loadDefault().
    print(holder.data)
# Another example
def setIndex0(variable, value):
    """Mutate the passed sequence in place: the caller sees the change."""
    variable[0] = value
def testSetIndex0():
    """Demonstrate in-place mutation through a function argument."""
    words = ['hello', 'world']
    setIndex0(words, 'Good morning')
    # words[0] now equals 'Good morning'
    print(words[0])
But you can't do this:
def setString(variable, value):
    """Rebind the parameter locally — the caller's variable is untouched."""
    # This only changes the name `variable` inside this function.
    variable = value
def testSetString():
    """Show that rebinding inside a function does not affect the caller."""
    text = 'Start'
    setString(text, 'Finish')
    # text is still 'Start'
    print(text)
If you want to be able to specify the location to store a value using a name, you could use a data structure with indexes/keys. Dictionaries let you access and store values using a key.
import pandas as pd


class MyData:
    """Named store of Excel sheets: load under a key, fetch by that key."""

    def __init__(self):
        # Mapping of store-name -> DataFrame.
        self.data = {}

    def loadFile(self, storename, filename, sheetname='Sheet1'):
        # FIX: the original `def` line was missing its trailing ":".
        # NOTE(review): `sheetname` became `sheet_name` in pandas 0.21+.
        self.data[storename] = pd.read_excel(filename, sheetname=sheetname)

    def getData(self, name):
        """Return the frame stored under `name` (raises KeyError if absent)."""
        return self.data[name]
def inputData():
    """Demo: load two sheets under different names and print both."""
    store = MyData()
    store.loadFile('name1', 'C:/Users/Data/try.xlsx')
    store.loadFile('name2', 'C:/Users/Data/tryagain.xlsx', 'Sheet2')
    print(store.getData('name1'))  # Sheet1
    print(store.getData('name2'))  # Sheet2
If you really want the function to be static, then you don't need to make a new class at all. The main reason for creating a class is to use it as a reusable structure to hold data with methods specific to that data.
import pandas as pd


def loadFile(filename, sheetname='Sheet1'):
    """Thin wrapper over read_excel with a default sheet name."""
    return pd.io.excel.read_excel(filename, sheetname=sheetname)
def inputData():
    """Demo: the wrapper is equivalent to calling read_excel directly."""
    frame = loadFile('C:/Users/Data/try.xlsx')
    print(frame)
    # The above is exactly the same as:
    frame = pd.io.excel.read_excel('C:/Users/Data/try.xlsx', sheetname='Sheet1')
    print(frame)
In your code df is a class object. To create an empty data frame you need to instantiate it. Instantiating classes in Python uses function notation. Also, we don't need to pass the default parameters when we read the excel file. This will help the code look cleaner.
Also, we don't need to pass the default parameters when we read the excel file. This will help the code look cleaner.
from pandas import DataFrame as df
class Data:
    # df() instantiates DataFrame, so x is now an empty frame, not the
    # class object.
    x = df()
    #staticmethod  -- NOTE(review): "@" was presumably intended; as
    # written this line is only a comment, so import_File is a plain
    # function attribute.
    def import_File(df_name, file):
        # Still rebinds the parameter only: Data.x is NOT updated here,
        # which is exactly what the surrounding text goes on to explain.
        df_name = pd.io.excel.read_excel(file.replace('"',''), sheetname='Sheet1')
When you pass Data.x to import_File(), df_name will refer to the same object as Data.x, which in this case is an empty dataframe. However, when you assign pd.io.excel.read_excel(file) to df_name then the connection between df_name and the empty dataframe is broken, and df_name now refers to the excel dataframe. Data.x has undergone no change during this process so it is still connected to for the empty data frame object.
A simpler way to see this with strings:
x = 'red'
df_name = x
We can break the df_name connection to the string object 'red' and form a new one with the object 'excel'.
df_name = 'excel'
print(x)
'red'
However, there's a simple fix for Data.x to return the excel dataframe.
import pandas as pd
from pandas import DataFrame as df


class Data:
    # Starts as an empty DataFrame; replaced once import_File() runs.
    x = df()

    @staticmethod  # FIX: the original "#staticmethod" was a typo for "@"
    def import_File(file):
        """Load Sheet1 of `file` into the class attribute Data.x.

        Assigning to Data.x (rather than to a parameter) makes the
        loaded frame visible to every user of the class.
        NOTE(review): `sheetname` became `sheet_name` in pandas 0.21+.
        """
        Data.x = pd.io.excel.read_excel(file.replace('"', ''),
                                        sheetname='Sheet1')


def inputdata():
    # Demo entry point: load the workbook and show the resulting frame.
    Data.import_File(r"C:\Users\Data\try.xlsx")
    print(Data.x)
However, I don't recommend using staticmethods, and you should include a constructor in your class as the other answer has recommended.

Categories

Resources