Class confusion containing pandas concat - python

When I use this class I don't get the transformed DataFrame but the old one.
The class instance does not transform the DataFrame given as a parameter.
import pandas as pd


class DataPreparation:
    def __init__(
        self,
        df_train: pd.DataFrame,
        df_test: pd.DataFrame
    ):
        self.df_train = df_train
        self.df_test = df_test
        self.add_embarked()

    def add_embarked(self):
        all_embarked = pd.concat([self.df_train.Embarked, self.df_test.Embarked])
        most_common_value_emb = all_embarked.mode()[0]
        self.df_train.Embarked.fillna(most_common_value_emb, inplace=True)
        self.df_test.Embarked.fillna(most_common_value_emb, inplace=True)
        self.onehotencoder_labels("Embarked")

    def onehotencoder_labels(self, column: str):
        one_hot_encoder_train = pd.get_dummies(self.df_train[column])
        one_hot_encoder_test = pd.get_dummies(self.df_test[column])
        self.df_train = pd.concat([self.df_train, one_hot_encoder_train], axis=1)
        self.df_test = pd.concat([self.df_test, one_hot_encoder_test], axis=1)
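For what it's worth, pd.get_dummies and pd.concat return new objects, so inside onehotencoder_labels the attributes self.df_train and self.df_test are rebound to new DataFrames while the variables the caller passed in still point to the originals. A minimal sketch of reading the transformed frames back from the instance (the train/test frames and their Embarked values are made-up examples):

import pandas as pd

# hypothetical Titanic-style frames
train = pd.DataFrame({"Embarked": ["S", "C", None]})
test = pd.DataFrame({"Embarked": ["Q", None, "S"]})

prep = DataPreparation(train, test)

# `train` and `test` never receive the new one-hot columns;
# the transformed frames live on the instance attributes
df_train = prep.df_train  # includes the one-hot encoded Embarked columns
df_test = prep.df_test
print(df_train.columns.tolist())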

Related

How to get a Series's parent DataFrame in Pandas?

Does a pandas.Series instance know its parent pandas.DataFrame when it comes from one?
Example:
import pandas
df = pandas.DataFrame({'col': range(10)})
series_column = df.col
print('My parent is {}'.format(series_column.parent))
# or
print('My parent is {}'.format(df.col.parent))
My goal is to make the signature of a method easier.
def foobar(data: DataFrame, column: str):
    return data[column].do_something()

# I would like to save one argument
def foobar(column: pandas.Series):
    return column.parent[column].do_something()
Here is a more real-world example:
def frequency(data: pandas.DataFrame, column: str, dropna: bool = False):
    tab = data[column].value_counts(dropna=dropna)

    # sort index if it is an ordered category
    if data[column].dtype.name == 'category':
        if data[column].cat.ordered:
            tab = tab.sort_index()

    # Series to DataFrame
    tab = tab.to_frame()

    # two column MultiIndex
    a = random_label()
    tab[a] = column
    tab = tab.reset_index()
    tab = tab.set_index([a, column])
    tab.index.names = (None, None)
    tab.columns = ['n']
    return tab
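For illustration, a hedged usage sketch: the grade column and its values are made up, the uuid-based stand-in replaces the undefined random_label() helper, and the index naming assumes pandas 2.x value_counts behaviour.

import uuid

import pandas


def random_label():
    # stand-in for the asker's undefined random_label() helper
    return uuid.uuid4().hex


# hypothetical data with an ordered categorical column
df = pandas.DataFrame({
    'grade': pandas.Categorical(['good', 'bad', 'good', 'ok'],
                                categories=['bad', 'ok', 'good'],
                                ordered=True)
})

print(frequency(df, 'grade'))
# expected: a two-level row index (hidden label, category value) and one column 'n'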

How to have documentation and type hints in vs code from methods created with pandas register_dataframe_accessor

I ran into some issues while trying to extend the pandas DataFrame API using the register_dataframe_accessor decorator. While I was able to create the class as indicated in the documentation, and use its properties and methods, I noticed that in VS Code the docstrings and type hints were not available.
In contrast, while working inside the classic Jupyter notebook or JupyterLab, both docstrings and type hints were available.
I did the same test with type hints in a dummy method, and again it worked in Jupyter but not in VS Code Jupyter.
Is it possible to make documentation and type hints available in VS Code for an additional DataFrame accessor?
import numpy as np
import pandas as pd


@pd.api.extensions.register_dataframe_accessor("geo")
class GeoAccessor:
    def __init__(self, pandas_obj):
        self._validate(pandas_obj)
        self._obj = pandas_obj

    @staticmethod
    def _validate(obj):
        # verify there is a column latitude and a column longitude
        if "latitude" not in obj.columns or "longitude" not in obj.columns:
            raise AttributeError("Must have 'latitude' and 'longitude'.")

    @property
    def center(self):
        # return the geographic center point of this DataFrame
        lat = self._obj.latitude
        lon = self._obj.longitude
        return (float(lon.mean()), float(lat.mean()))

    def plot(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """With docstring
        plot this array's data on a map, e.g., using Cartopy
        """
        return dataframe


ds = pd.DataFrame(
    {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)}
)
ds.geo.plot(ds)
This problem has been solved in this discussion.
You need to add a TYPE_CHECKING block:
if TYPE_CHECKING:
    class DataFrame(pd.DataFrame):
        geo: GeoAccessor

You have to do this every time you transform the DataFrame:

ds: 'DataFrame' = pd.DataFrame()
After modification
from typing import TYPE_CHECKING

import pandas as pd
import numpy as np


@pd.api.extensions.register_dataframe_accessor("geo")
class GeoAccessor:
    def __init__(self, pandas_obj):
        self._validate(pandas_obj)
        self._obj = pandas_obj

    @staticmethod
    def _validate(obj):
        # verify there is a column latitude and a column longitude
        if "latitude" not in obj.columns or "longitude" not in obj.columns:
            raise AttributeError("Must have 'latitude' and 'longitude'.")

    @property
    def center(self):
        # return the geographic center point of this DataFrame
        lat = self._obj.latitude
        lon = self._obj.longitude
        return (float(lon.mean()), float(lat.mean()))

    def plot(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """_summary_

        Args:
            dataframe (pd.DataFrame): _description_

        Returns:
            pd.DataFrame: _description_
        """
        return dataframe


def main():
    if TYPE_CHECKING:
        class DataFrame(pd.DataFrame):
            geo: GeoAccessor

    ds: 'DataFrame' = pd.DataFrame(
        # ^ you have to do this every time you transform the DataFrame
        {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)}
    )
    ds.geo.plot(ds)
    # ^ autocomplete is now provided


if __name__ == "__main__":
    main()
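As the answer notes, the annotation has to be repeated after every transformation, because pandas methods return a plain pd.DataFrame. A small follow-on sketch inside main() (the assign call and the altitude column are made up for illustration):

ds2: 'DataFrame' = ds.assign(altitude=0)
# ^ re-annotate so the editor still knows about the .geo accessor
print(ds2.geo.center)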

How do I do a time series analysis as part of running a function in Python 3.7?

I have this:
formulas_count_stats.py:
import pandas as pd
from df_count_stats import df, df1

df = df
df1 = df1


class Data_load_compare_0:
    def __init__(self, df):
        self.df = pd.read_csv(df, delimiter=';')

    '''
    Data information section from df = basic stats
    '''
    def get_EDA_columns(self):
        return self.df.columns
    def get_EDA_info(self):
        return self.df.info()
    def get_EDA_describe(self):
        return self.df.describe()
    def get_EDA_shape(self):
        return self.df.shape
    def get_EDA_value_counts(self):
        return self.df.value_counts()
    def get_EDA_isnull(self):
        return self.df.isnull()
    def get_EDA_dtypes(self):
        return self.df.dtypes
    def get_EDA_isna(self):
        return self.df.isna()
    def get_EDA_nunique(self):
        return self.df.nunique()
    def get_EDA_sort_dipl(self):
        return self.df.query("col1 == 'X'")
    def get_EDA_sort_bach(self):
        return self.df.query("col1 == 'Y'")
    def get_EDA_sort_by_line(self):
        return self.df.groupby(['col2', 'col1', 'col3']).agg(['count'])  # groupby(['User Name', 'col2'])['col1'].size().reset_index(name='counts')

    '''
    Time series
    '''
    import matplotlib.pyplot as plt

    def get_time_series(self):
        df['Logon Time'] = pd.to_datetime(df['Logon Time'], errors='coerce')
        df['Year'] = df.index.dt.year
        df['month'] = df.index.dt.month
        df['day'] = df.index.dt.day
        df['hour'] = df.index.dt.hour
        df['week'] = df.index.dt.week
        df['count'] = df['User Name']
        return df.groupby([df['Logon Time'].dt.year, df['Logon Time'].dt.month]).sum().plot.bar()
        plt.show()
...and running the functions from main_count_stats.py (loading data from df_count_stats.py):
from df_count_stats import df_load, df1_load
from formulas_count_stats import Data_load_compare_0, Data_load_compare_1
myData = Data_load_compare_0(df_load)
myData1 = Data_load_compare_1(df1_load)
EDA_stats_00_0 = myData.get_EDA_columns()
EDA_stats_01_0 = myData.get_EDA_nunique()
EDA_stats_02_0 = myData.get_EDA_shape()
EDA_stats_03_0 = myData.get_EDA_info()
EDA_stats_04_0 = myData.get_EDA_isna()
EDA_stats_05_0 = myData.get_EDA_isnull()
EDA_stats_06_0 = myData.get_EDA_describe()
EDA_stats_07_0 = myData.get_EDA_dtypes()
EDA_stats_08_0 = myData.get_EDA_sort_bach()
EDA_stats_09_0 = myData.get_EDA_sort_dipl()
EDA_stats_10_0 = myData.get_EDA_sort_by_line()
EDA_stats_11_0 = myData.get_time_series()
I get this error:
Traceback (most recent call last):
  File "C:/.../.../main_count_stats.py", line 25, in <module>
    EDA_stats_11_0 = myData.get_time_series()
  File "C:\...\...\...\formulas_count_stats.py", line 59, in get_time_series
    df['Year'] = df.index.dt.year
AttributeError: 'RangeIndex' object has no attribute 'dt'
I hoped that my attempt to integrate a simple time series analysis as part of an otherwise well-functioning formulas_count_stats.py would work. Obviously, it does not. I did change the index to a 'to_datetime' format.
How can I solve this?
Or you can convert the index to a Series and use .dt:
def get_time_series(self):
    self.df['Logon Time'] = pd.to_datetime(self.df['Logon Time'], errors='coerce')
    # assumes the index is a DatetimeIndex; wrapping it in a Series gives access to .dt
    self.df[['Year', 'month', 'day', 'hour', 'week']] = (pd.Series(self.df.index)
                                                         .dt.strftime('%Y-%m-%d-%H-%W')
                                                         .str.split('-', expand=True)
                                                         .astype(int)).values
    self.df['count'] = self.df['User Name']
    return self.df.groupby([self.df['Logon Time'].dt.year,
                            self.df['Logon Time'].dt.month]).sum().plot.bar()
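Alternatively, here is a minimal standalone sketch that avoids the index entirely and derives the date parts from the parsed 'Logon Time' column. The column names are taken from the question, the per-month bar chart counts rows with .size() instead of summing mixed-type columns, and .dt.isocalendar().week assumes pandas >= 1.1:

import pandas as pd
import matplotlib.pyplot as plt


def get_time_series(df: pd.DataFrame):
    # standalone version of the method body, for illustration only
    logon = pd.to_datetime(df['Logon Time'], errors='coerce')
    df['Logon Time'] = logon
    df['Year'] = logon.dt.year
    df['month'] = logon.dt.month
    df['day'] = logon.dt.day
    df['hour'] = logon.dt.hour
    df['week'] = logon.dt.isocalendar().week  # .dt.week is deprecated in recent pandas
    # count logons per (year, month) and plot them as a bar chart
    ax = df.groupby([logon.dt.year, logon.dt.month]).size().plot.bar()
    plt.show()
    return ax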

python class empty dataframe

The purpose of this script is:
• Read a group of csv files.
• Scrape the date and extract some features out of it.
• Merge these csv files into a single data frame.
• Import the final data frame into another class and print it.
Here is the code:
import pandas as pd
import os


class DataSource:
    def __init__(self):
        self.dfs = []
        self.final = pd.DataFrame()
        self.names = ['Date', 'Time', 'open', 'high', 'low', 'close', 'Volume']
        self.directory = os.chdir(r"C:\Users\Sayed\Desktop\forex")

    def merge(self):
        for file in os.listdir(self.directory):
            df = pd.read_csv(file, names=self.names,
                             parse_dates={'Release Date': ['Date', 'Time']})
            self.dfs.append(df)
        self.final = pd.concat(self.dfs, axis=0)
        self.final = self.final[['Release Date', 'open', 'high', 'low', 'close']]
        print(self.final.head())
        return self.final


class test():
    def __init__(self):
        self.df = DataSource().final

    def print(self):
        return print(self.df)


x = test()
x.print()
Here is the output:
Empty DataFrame
Columns: []
Index: []
You need to call the method that actually builds the DataFrame; the constructor only creates an empty one. Try this:

class test():
    def __init__(self):
        self.df = DataSource().merge()
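Equivalently, DataSource could build the merged frame in its constructor so that .final is populated as soon as the object exists. A hedged sketch under the same assumptions as the question (the directory path and column names come from the original post; an explicit path is passed to read_csv instead of relying on os.chdir):

import os

import pandas as pd


class DataSource:
    def __init__(self, directory=r"C:\Users\Sayed\Desktop\forex"):
        self.dfs = []
        self.names = ['Date', 'Time', 'open', 'high', 'low', 'close', 'Volume']
        self.directory = directory
        self.final = self.merge()  # populate .final up front

    def merge(self):
        for file in os.listdir(self.directory):
            df = pd.read_csv(os.path.join(self.directory, file), names=self.names,
                             parse_dates={'Release Date': ['Date', 'Time']})
            self.dfs.append(df)
        merged = pd.concat(self.dfs, axis=0)
        return merged[['Release Date', 'open', 'high', 'low', 'close']]


class test():
    def __init__(self):
        self.df = DataSource().final  # already populated by the constructor

    def print(self):
        return print(self.df)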

Function erroring out when calling another function

I'm getting the following error when calling a function from another function:
TypeError: 'GLMResultsWrapper' object is not callable
I get the error at the coeffs = model_results(model_results) line below.
model_results is another function that runs error-free outside of table_to_graph; it takes the summary output from a statsmodels model and puts it into a data frame.
The table_to_graph function joins that data frame to another table, the df passed in as input. It is the following:
# Add into table generation table
def table_to_graph(model_results, df):
    '''
    function that combines rating tables and model summary
    '''
    coeffs = model_results(model_results)
    try:
        df['key'] = df['variable'] + "_" + df['level']
        df = pd.merge(df, coeffs, left_on='key', right_on='index', how='left')
        df['factor'] = np.exp(df[factor])
        df['factor'].fillna(1, inplace=True)
        df['error_up'] = np.exp(df[error_up])
        df['error_down'] = np.exp(df[error_down])
        # title2 = title1
        df = df[['model', 'variable', 'level', 'total_incurred', 'total_count', 'cmeu',
                 'factor', 'error_up', 'error_down', 'pricing_model_1_p_values']]
        return df
        # df1 = df1.append(df)
    except:
        # df['level'] = df['level'].astype('str')
        df['key'] = df['variable'] + "_" + df['level'].astype('str')
        df['level'] = df['level'].astype('int')
        df = pd.merge(df, coeffs, left_on='key', right_on='index', how='left')
        df['factor'] = np.exp(df[factor])
        df['factor'].fillna(1, inplace=True)
        df['error_up'] = np.exp(df[error_up])
        df['error_down'] = np.exp(df[error_down])
        df = df[['model', 'variable', 'level', 'total_incurred', 'total_count', 'cmeu',
                 'factor', 'error_up', 'error_down', 'pricing_model_1_p_values']]
        # df1 = df1.append(df)
        return df
model_results function below:
def model_results(model_results):
    '''
    function that puts model parameters into a data frame
    '''
    df = pd.DataFrame(model_results.params, columns=['factor'])
    df['error_down'] = model_results.conf_int()[0]
    df['error_up'] = model_results.conf_int()[1]
    df['standard_error'] = model_results.bse
    df['pvalues'] = round(model_results.pvalues, 3)
    df.reset_index(inplace=True)
    return df
The problem is that you are not calling the function you defined as model_results; inside table_to_graph the parameter named model_results shadows it, so you are "calling" the results object on itself. That is why you get the error that the object is not callable.
Change either the function name or the name of the model_results parameter/data to something else. That lets Python distinguish between the two and do what you want: call the model_results function on the model_results data.
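A minimal sketch of that rename; results_to_frame and results are names chosen here for illustration, not from the original code:

import pandas as pd


def results_to_frame(results):
    # same logic as the original model_results() helper, renamed so that the
    # model_results parameter of table_to_graph can no longer shadow it
    df = pd.DataFrame(results.params, columns=['factor'])
    df['error_down'] = results.conf_int()[0]
    df['error_up'] = results.conf_int()[1]
    df['standard_error'] = results.bse
    df['pvalues'] = round(results.pvalues, 3)
    df.reset_index(inplace=True)
    return df


# inside table_to_graph, the first line then becomes:
#     coeffs = results_to_frame(model_results)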
