How to get a Series's parent DataFrame in Pandas?

Does a pandas.Series instance know its parent pandas.DataFrame when it comes from one?
Example:
import pandas
df = pandas.DataFrame({'col': range(10)})
series_column = df.col
print('My parent is {}'.format(series_column.parent))
# or
print('My parent is {}'.format(df.col.parent))
My goal is to simplify the signature of a method.
def foobar(data: DataFrame, column: str):
    return data[column].do_something()

# I would like to save one argument
def foobar(column: pandas.Series):
    return column.parent[column].do_something()
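As far as I know, pandas exposes no public back-reference from a Series to the DataFrame it was taken from; the only context that travels with the Series is its label, in Series.name. A quick check:

import pandas as pd

df = pd.DataFrame({'col': range(10)})
s = df['col']
print(s.name)                  # 'col' -- the column label travels with the Series
print(hasattr(s, 'parent'))    # False -- no public back-reference to the DataFrame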
Here is a more real world example:
def frequency(data: pandas.DataFrame, column: str, dropna: bool = False):
    tab = data[column].value_counts(dropna=dropna)
    # sort index if it is an ordered category
    if data[column].dtype.name == 'category':
        if data[column].cat.ordered:
            tab = tab.sort_index()
    # Series to DataFrame
    tab = tab.to_frame()
    # two column MultiIndex
    a = random_label()
    tab[a] = column
    tab = tab.reset_index()
    tab = tab.set_index([a, column])
    tab.index.names = (None, None)
    tab.columns = ['n']
    return tab
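Since there is no parent pointer, a minimal sketch of the one-argument variant is to pass the Series itself and lean on Series.name where a label is needed; the MultiIndex cosmetics above are omitted here, and the caller is assumed to pass df[column]:

def frequency(column: pandas.Series, dropna: bool = False):
    # the Series carries both the data and its label (column.name),
    # so the parent DataFrame is never needed
    tab = column.value_counts(dropna=dropna)
    if column.dtype.name == 'category' and column.cat.ordered:
        tab = tab.sort_index()
    return tab.to_frame(name='n')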

Related

Python, Streamlit AgGrid add new row to AgGrid Table

I am trying to add a new row to an AgGrid table using Streamlit and Python.
At this point, I just want to add one or more new rows to the table generated by AgGrid by pressing the "Add row" button.
After pressing the "Add row" button I mistakenly generate a second table with the new row, so I get two data tables instead of updating the main one.
The initial data df = get_data() is gathered from a SQL query. I want to add a new row and (for now) save it to a CSV file, or at least get the updated dataframe with the new row added as an output and graph it.
My current code
import datetime

import pandas as pd
import streamlit as st
from st_aggrid import AgGrid, AgGridTheme, DataReturnMode, GridOptionsBuilder

from metrics.get_metrics import get_data
from metrics.config import PATH_SAMPLES

filename: str = 'updated_sample.csv'
save_path = PATH_SAMPLES.joinpath(filename)

def generate_agrid(data: pd.DataFrame):
    gb = GridOptionsBuilder.from_dataframe(data)
    gb.configure_default_column(editable=True)  # Make columns editable
    gb.configure_pagination(paginationAutoPageSize=True)  # Add pagination
    gb.configure_side_bar()  # Add a sidebar
    gb.configure_selection('multiple', use_checkbox=True,
                           groupSelectsChildren="Group checkbox select children")  # Enable multi-row selection
    gridOptions = gb.build()

    grid_response = AgGrid(
        data,
        gridOptions=gridOptions,
        data_return_mode=DataReturnMode.AS_INPUT,
        update_on='MANUAL',  # <- Should it let me update before returning?
        fit_columns_on_grid_load=False,
        theme=AgGridTheme.STREAMLIT,  # Add theme color to the table
        enable_enterprise_modules=True,
        height=350,
        width='100%',
        reload_data=True
    )

    data = grid_response['data']
    selected = grid_response['selected_rows']
    df = pd.DataFrame(selected)  # Pass the selected rows to a new dataframe df

    return grid_response

def onAddRow(grid_table):
    df = pd.DataFrame(grid_table['data'])
    column_fillers = {
        column: (False if df.dtypes[column] == "BooleanDtype"
                 else 0 if df.dtypes[column] == "dtype('float64')"
                 else '' if df.dtypes[column] == "string[python]"
                 else datetime.datetime.utcnow() if df.dtypes[column] == "dtype('<M8[ns]')"
                 else '')
        for column in df.columns
    }
    data = [column_fillers]
    df_empty = pd.DataFrame(data, columns=df.columns)
    df = pd.concat([df, df_empty], axis=0, ignore_index=True)
    grid_table = generate_agrid(df)
    return grid_table

# First data gather
df = get_data()

if __name__ == '__main__':
    # Start graphing
    grid_table = generate_agrid(df)
    # add row
    st.sidebar.button("Add row", on_click=onAddRow, args=[grid_table])
Here is a minimal code sample.
import streamlit as st
import pandas as pd
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode

def generate_agrid(df):
    gb = GridOptionsBuilder.from_dataframe(df)
    gb.configure_selection(selection_mode="multiple", use_checkbox=True)
    gridoptions = gb.build()

    grid_response = AgGrid(
        df,
        height=200,
        gridOptions=gridoptions,
        update_mode=GridUpdateMode.MANUAL
    )

    selected = grid_response['selected_rows']

    # Show the selected row.
    if selected:
        st.write('selected')
        st.dataframe(selected)

    return grid_response

def add_row(grid_table):
    df = pd.DataFrame(grid_table['data'])
    new_row = [['', 100]]
    df_empty = pd.DataFrame(new_row, columns=df.columns)
    df = pd.concat([df, df_empty], axis=0, ignore_index=True)
    # Save new df to sample.csv.
    df.to_csv('sample.csv', index=False)

def get_data():
    """Reads sample.csv and returns a dataframe."""
    return pd.read_csv('sample.csv')

if __name__ == '__main__':
    df = get_data()
    grid_response = generate_agrid(df)
    st.sidebar.button("Add row", on_click=add_row, args=[grid_response])
(Screenshots omitted: the initial output, and the output after pressing "Add row".)
sample.csv
team,points
Lakers,120
Celtics,130
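The key design choice above is to persist the table outside the script run: add_row writes the updated dataframe back to sample.csv, and when Streamlit reruns the script after the button click, get_data() reads it again, so the grid is built exactly once per run and no second table appears. A minimal sketch of the same pattern kept in memory via st.session_state instead of a CSV, assuming the same team/points columns:

import streamlit as st
import pandas as pd

# Keep the dataframe across reruns instead of round-tripping through a CSV.
if 'df' not in st.session_state:
    st.session_state.df = pd.DataFrame({'team': ['Lakers', 'Celtics'], 'points': [120, 130]})

def add_row():
    new_row = pd.DataFrame([['', 100]], columns=st.session_state.df.columns)
    st.session_state.df = pd.concat([st.session_state.df, new_row], ignore_index=True)

st.sidebar.button("Add row", on_click=add_row)  # on_click runs before the rerun
st.dataframe(st.session_state.df)               # the one and only table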

Is there a way to store output dataframes and append them to the last output in the same dataframe

I am trying to fetch data from an API for 50 parcels and want the results in a single dataframe. While running this loop, the dataframe keeps only the last parcel that satisfies the loop condition. Is there any way to store all the previous outputs in the same dataframe as well?
For example, upon running this code it only returns the dataframe for foreign id = 50; I want the dataframe for all of 1-50.
import requests
import pandas as pd

foreign = 1
while (foreign <= 50):
    s1_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s1?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs1product_end_time%2Cs1product_ron%2Ccohvh_avg%2Ccohvv_avg%2Cvhvv_avg'
    s2_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s2?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs2product_start_time%2Cs2product_ron%2Cndvi_avg'
    # splice the current parcel id into the parcel_foreign_id placeholder
    position = 101
    foreign_n = str(foreign)
    s1_time_series_url_p6 = s1_time_series_url_p6[:position] + foreign_n + s1_time_series_url_p6[position+1:]
    s2_time_series_url_p6 = s2_time_series_url_p6[:position] + foreign_n + s2_time_series_url_p6[position+1:]
    r_s1_time_series_p6 = requests.get(s1_time_series_url_p6)
    r_s2_time_series_p6 = requests.get(s2_time_series_url_p6)
    json_s1_time_series_p6 = r_s1_time_series_p6.json()
    json_s2_time_series_p6 = r_s2_time_series_p6.json()
    df_s1_time_series_p6 = pd.DataFrame(json_s1_time_series_p6['s1_time_series'])
    df_s2_time_series_p6 = pd.DataFrame(json_s2_time_series_p6['s2_time_series'])
    df_s2_time_series_p6.s2product_start_time = df_s2_time_series_p6.s2product_start_time.str[0:11]
    df_s1_time_series_p6.s1product_end_time = df_s1_time_series_p6.s1product_end_time.str[0:11]
    # dfinal_p6 is overwritten on every pass, which is why only parcel 50 survives
    dfinal_p6 = df_s1_time_series_p6.merge(df_s2_time_series_p6, how='inner', left_on='s1product_end_time', right_on='s2product_start_time')
    cols_p6 = ['parcel_foreign_id_x', 's1product_ron', 'parcel_foreign_id_y', 's2product_ron']
    dfinal_p6[cols_p6] = dfinal_p6[cols_p6].apply(pd.to_numeric, errors='coerce', axis=1)
    foreign = foreign + 1
dfinal_p6
The issue is resolved by first creating an empty dataframe and then appending each iteration's output to it within the loop.
The updated code is as follows:
column_names = ["parcel_foreign_id_x", "s1product_end_time", "s1product_ron", "cohvh_avg", "cohvv_avg", "vhvv_avg", "parcel_foreign_id_y", "s2product_start_time", "s2product_ron", "ndvi_avg"]
df = pd.DataFrame(columns=column_names)

foreign = 1
while (foreign <= 50):
    s1_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s1?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs1product_end_time%2Cs1product_ron%2Ccohvh_avg%2Ccohvv_avg%2Cvhvv_avg'
    s2_time_series_url_p6 = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s2?limit_to_rasters=true&parcel_foreign_id=0&properties=parcel_foreign_id%2Cs2product_start_time%2Cs2product_ron%2Cndvi_avg'
    position = 101
    foreign_n = str(foreign)
    s1_time_series_url_p6 = s1_time_series_url_p6[:position] + foreign_n + s1_time_series_url_p6[position+1:]
    s2_time_series_url_p6 = s2_time_series_url_p6[:position] + foreign_n + s2_time_series_url_p6[position+1:]
    r_s1_time_series_p6 = requests.get(s1_time_series_url_p6)
    r_s2_time_series_p6 = requests.get(s2_time_series_url_p6)
    json_s1_time_series_p6 = r_s1_time_series_p6.json()
    json_s2_time_series_p6 = r_s2_time_series_p6.json()
    df_s1_time_series_p6 = pd.DataFrame(json_s1_time_series_p6['s1_time_series'])
    df_s2_time_series_p6 = pd.DataFrame(json_s2_time_series_p6['s2_time_series'])
    df_s2_time_series_p6.s2product_start_time = df_s2_time_series_p6.s2product_start_time.str[0:11]
    df_s1_time_series_p6.s1product_end_time = df_s1_time_series_p6.s1product_end_time.str[0:11]
    dfinal_p6 = df_s1_time_series_p6.merge(df_s2_time_series_p6, how='inner', left_on='s1product_end_time', right_on='s2product_start_time')
    cols_p6 = ['parcel_foreign_id_x', 's1product_ron', 'parcel_foreign_id_y', 's2product_ron']
    dfinal_p6[cols_p6] = dfinal_p6[cols_p6].apply(pd.to_numeric, errors='coerce', axis=1)
    # accumulate instead of overwrite
    df = pd.concat([dfinal_p6, df], ignore_index=True)
    foreign = foreign + 1
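Two smaller refinements, sketched below under the assumption that the API behaves as above: collecting the per-parcel frames in a list and concatenating once after the loop avoids re-copying the growing dataframe on every iteration, and passing the query string via requests' params= argument avoids the fragile character splice at position 101. Only the s1 endpoint is shown; the s2 side is analogous.

import requests
import pandas as pd

S1_URL = 'https://demodev2.kappazeta.ee/ard_api_demo/v1/time_series/s1'

frames = []
for foreign in range(1, 51):
    r = requests.get(S1_URL, params={
        'limit_to_rasters': 'true',
        'parcel_foreign_id': foreign,  # no string surgery needed
        'properties': 'parcel_foreign_id,s1product_end_time,s1product_ron,cohvh_avg,cohvv_avg,vhvv_avg',
    })
    frames.append(pd.DataFrame(r.json()['s1_time_series']))

# one concat at the end instead of one per iteration
df_s1 = pd.concat(frames, ignore_index=True)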

Class confusion containing pandas concat

When I use this class I don't get the transformed dataframe but the old one: the class instance does not transform the dataframes given as parameters.
class DataPreparation:
    def __init__(
        self,
        df_train: pd.DataFrame,
        df_test: pd.DataFrame
    ):
        self.df_train = df_train
        self.df_test = df_test
        self.add_embarked()

    def add_embarked(self):
        all_embarked = pd.concat([self.df_train.Embarked, self.df_test.Embarked])
        most_common_value_emb = all_embarked.mode()[0]
        self.df_train.Embarked.fillna(most_common_value_emb, inplace=True)
        self.df_test.Embarked.fillna(most_common_value_emb, inplace=True)
        self.onehotencoder_labels("Embarked")

    def onehotencoder_labels(self, column: str):
        one_hot_encoder_train = pd.get_dummies(self.df_train[column])
        one_hot_encoder_test = pd.get_dummies(self.df_test[column])
        self.df_train = pd.concat([self.df_train, one_hot_encoder_train], axis=1)
        self.df_test = pd.concat([self.df_test, one_hot_encoder_test], axis=1)
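The likely cause is that pd.concat returns a new DataFrame: reassigning self.df_train and self.df_test inside onehotencoder_labels rebinds the instance attributes only, so the variables the caller passed in still point at the original, un-encoded frames (only the inplace fillna touched them). A minimal sketch of reading the transformed frames back off the instance:

prep = DataPreparation(df_train, df_test)

# the transformed frames live on the instance, not in the caller's variables
df_train, df_test = prep.df_train, prep.df_test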

How to fix IndexError: Too many levels: Index has only 1 level, not 2 in Python

I have written the following code, in which the function model_data performs a particular set of tasks. I have to pass the list of badges and the type of category (1 or 2) along with an empty dataframe data.
But while running the code I am getting an error. I searched SO for answers but could not find this type of question.
CODE
#Model Function
def model_data(badge_list, data):
    for key, value in badge_list.items():
        #Check for Post Type
        if (value == 1):
            badge_type = posts.loc[posts.PostTypeId == '1']
        elif (value == 2):
            badge_type = posts.loc[posts.PostTypeId == '2']
        #Obtain required fields from Badge Data
        badge_type = badge_type[['OwnerUserId', 'Id', 'Score', 'CreationDate']]
        badge_type.columns = ['UserId', 'Id', 'Score', 'CreationDate']
        Badge = key
        #Obtain time when user first obtained Badge
        badge_data = user_badge_dt(Badge)
        #Find the number of posts made before and after 1 week of Badge Attainment
        post_data = post_details(df1=badge_data, df2=badge_type)
        post_data.date = pd.to_datetime(post_data.date)
        #Calculate APR
        post_data = APR(post_data)
        #Calculate Score
        post_data = score(df=post_data, post_type=badge_type)
        #Generate Final Dataframe with Badge Count
        data1 = badge_number(post_data)
        data1 = data1[['1', '2', '3', 'date', 'Score', 'APR']]
        #Append Dataframe
        data = data.append(data1)
    return data
#Function Call
questionBadge_list = {'Good Question':1, 'Explainer':2}
data = pd.DataFrame()
badge1_data = model_data(badge_list = questionBadge_list, data = data)
ERROR
IndexError: Too many levels: Index has only 1 level, not 2
ERROR LINE
The code line badge_data = user_badge_dt(Badge) gives this error, so I am adding the complete function.
#Function to obtain UserId with the date-time of obtaining given badge for the first time
def user_badge_dt(badge):
    #Creating DataFrame to obtain all UserId and date-Time of given badge
    df = badges[['UserId', 'Date']].loc[badges.Name == badge]
    #Obtaining the first date-time of badge attainment
    v = df.groupby("UserId", group_keys=False)['Date'].nsmallest(1)
    v.index = v.index.droplevel(1)
    df['date'] = df['UserId'].map(v)
    df.drop(columns='Date', inplace=True)
    #Removing all duplicate values of Users
    df.drop_duplicates(subset='UserId', inplace=True)
    return df
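A plausible reading of the error: droplevel(1) assumes the groupby result carries a two-level index (UserId plus the original row index), but with group_keys=False recent pandas can return the nsmallest result on a single-level index, so there is no level 1 to drop. One way to sidestep the droplevel entirely, since the first attainment date is just the per-user minimum, is a sketch like:

#Obtaining the first date-time of badge attainment, one row per UserId
v = df.groupby('UserId')['Date'].min()   # indexed by UserId, single level
df['date'] = df['UserId'].map(v)         # map() aligns on UserId directly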

Function erroring out when calling another function

I'm getting the following error when calling a function from another function:
TypeError: 'GLMResultsWrapper' object is not callable
I get the error at the coeffs = model_results(model_results) line below.
This is another function that runs error-free outside of table_to_graph; the model_results function takes the summary output from a statsmodels model and puts it into a dataframe.
The table_to_graph function joins that dataframe to another table, which is the df input. The table_to_graph function is the following:
# Add into table generation table
def table_to_graph(model_results, df):
    '''
    #function that combines rating tables and model summary
    '''
    coeffs = model_results(model_results)
    try:
        df['key'] = df['variable'] + "_" + df['level']
        df = pd.merge(df, coeffs, left_on='key', right_on='index', how='left')
        df['factor'] = np.exp(df['factor'])
        df['factor'].fillna(1, inplace=True)
        df['error_up'] = np.exp(df['error_up'])
        df['error_down'] = np.exp(df['error_down'])
        #title2 = title1
        df = df[['model', 'variable', 'level', 'total_incurred', 'total_count', 'cmeu', 'factor', 'error_up', 'error_down', 'pricing_model_1_p_values']]
        return df
        #df1 = df1.append(df)
    except:
        #df['level'] = df['level'].astype('str')
        df['key'] = df['variable'] + "_" + df['level'].astype('str')
        df['level'] = df['level'].astype('int')
        df = pd.merge(df, coeffs, left_on='key', right_on='index', how='left')
        df['factor'] = np.exp(df['factor'])
        df['factor'].fillna(1, inplace=True)
        df['error_up'] = np.exp(df['error_up'])
        df['error_down'] = np.exp(df['error_down'])
        df = df[['model', 'variable', 'level', 'total_incurred', 'total_count', 'cmeu', 'factor', 'error_up', 'error_down', 'pricing_model_1_p_values']]
        #df1 = df1.append(df)
        return df
model_results function below:
def model_results(model_results):
    '''
    function that puts model parameters into a data frame
    '''
    df = pd.DataFrame(model_results.params, columns=['factor'])
    df['error_down'] = model_results.conf_int()[0]
    df['error_up'] = model_results.conf_int()[1]
    df['standard_error'] = model_results.bse
    df['pvalues'] = round(model_results.pvalues, 3)
    df.reset_index(inplace=True)
    return df
The problem is that, inside table_to_graph, the name model_results refers to the parameter (the GLMResultsWrapper you passed in), which shadows the function of the same name. So model_results(model_results) tries to call the results object itself, which is why Python reports that the object is not callable.
Rename either the function or the parameter so Python can distinguish the two and actually call the function on the data.
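A minimal sketch of the rename, keeping everything else as-is (results_to_frame is just an illustrative name):

def results_to_frame(results):
    '''puts model parameters into a data frame'''
    df = pd.DataFrame(results.params, columns=['factor'])
    df['error_down'] = results.conf_int()[0]
    df['error_up'] = results.conf_int()[1]
    df['standard_error'] = results.bse
    df['pvalues'] = round(results.pvalues, 3)
    df.reset_index(inplace=True)
    return df

def table_to_graph(model_results, df):
    coeffs = results_to_frame(model_results)  # the parameter no longer shadows the helper
    ...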
