Pass a DataFrame from one function to another in Python

I am using two functions, one to load data and another to get a summary of the same data. However, in the second function, analyze(), I get the error that df is not defined. How do I pass df from Loader() to analyze()?
from xlwings import Workbook, Range
import pandas as pd

def Loader():
    wb = Workbook.caller()
    file_path = Range(1, (1, 1)).value
    file = pd.read_excel(file_path, sheetname='Sheet1')
    df = pd.DataFrame(file)

def analyze():
    Range('C1').value = df.describe()

There are several ways, depending on what you want to do. The simplest is to return df from Loader() and pass it to analyze() as an argument:
def Loader():
    wb = Workbook.caller()
    file_path = Range(1, (1, 1)).value
    file = pd.read_excel(file_path, sheetname='Sheet1')
    df = pd.DataFrame(file)
    return df

def analyze(df):
    Range('C1').value = df.describe()

# Use it this way
dataframe = Loader()
analyze(dataframe)
Another way is to use a Loader class, like this:
class Loader(object):
    def __init__(self):
        wb = Workbook.caller()
        file_path = Range(1, (1, 1)).value
        file = pd.read_excel(file_path, sheetname='Sheet1')
        self.df = pd.DataFrame(file)
        # 1) If you want to analyze when you create the object,
        # call analyze() here
        self.analyze()

    def analyze(self):
        Range('C1').value = self.df.describe()

loader = Loader()
# 2) Otherwise you can keep control of analyze()
# and call it whenever you want, like this:
loader.analyze()
Of course there are other ways too (like having a global variable for the df).
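For illustration, a minimal sketch of the global-variable approach (usually discouraged, since it hides the data flow, but it works for small scripts):

df = None  # module-level placeholder

def Loader():
    global df
    wb = Workbook.caller()
    file_path = Range(1, (1, 1)).value
    df = pd.read_excel(file_path, sheetname='Sheet1')

def analyze():
    # Assumes Loader() has been called first, so df is populated
    Range('C1').value = df.describe()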

Related

Python Streamlit: updated cells jump back to default

I am trying to find a solution to the following issue.
I would like to upload an Excel file consisting of multiple sheets (two in this use case). I then added tabs via Streamlit and used the AgGrid component so that some cells can be edited. However, if I change cells in the first tab, jump to tab 2, and come back, the changes are gone. This is not the desired behavior: any changes made to a cell should remain.
I tried st.cache and st.experimental_memo, but without success.
My code is below:
import numpy as np
import streamlit as st
import pandas as pd
from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode, JsCode

excelfile = st.sidebar.file_uploader("Select Excel-File for cleansing", key="Raw_Data")
if excelfile == None:
    st.balloons()
tab1, tab2 = st.tabs(["Sheet 1", "Sheet 2"])

#st.cache()
def load_sheet1():
    sheet1 = pd.read_excel(excelfile, sheet_name="Sheet1")
    return sheet1

#st.cache()
def load_sheet2():
    sheet1 = pd.read_excel(excelfile, sheet_name="Sheet2")
    return sheet1

df = load_sheet1()
with tab1:
    gd = GridOptionsBuilder.from_dataframe(df)
    gd.configure_pagination(enabled=True)
    gd.configure_default_column(editable=True, groupable=True)
    gd.configure_selection(selection_mode="multiple", use_checkbox=True)
    gridoptions = gd.build()
    grid_table = AgGrid(
        df,
        gridOptions=gridoptions,
        update_mode=GridUpdateMode.SELECTION_CHANGED,
        theme="material",
    )

df1 = load_sheet2()
with tab2:
    gd = GridOptionsBuilder.from_dataframe(df1)
    gd.configure_pagination(enabled=True)
    gd.configure_default_column(editable=True, groupable=True)
    gd.configure_selection(selection_mode="multiple", use_checkbox=True)
    gridoptions = gd.build()
    grid_table = AgGrid(
        df1,
        gridOptions=gridoptions,
        update_mode=GridUpdateMode.SELECTION_CHANGED,
        theme="material",
    )
I can also share my test Excel file:

Sheet 1:
Col1    Col2
A       C
B       D

Sheet 2:
Col3    Col4
E       G
F       H
Any support on how to eliminate this issue would be more than awesome.
EDIT: Here is a solution without the load button.
I couldn't find a way to do it without adding a button that reloads the page to apply the changes. Since Streamlit reruns the whole script every time you interact with it, it is a bit tricky to render elements the right way. Here is your code refactored. Hope this helps!
import streamlit as st
import pandas as pd
from st_aggrid import AgGrid, GridUpdateMode, GridOptionsBuilder

# Use session_state to keep a stack of changes
if 'df' not in st.session_state:
    st.session_state.df = pd.DataFrame()
if 'df1' not in st.session_state:
    st.session_state.df1 = pd.DataFrame()
if 'excelfile' not in st.session_state:
    st.session_state.excelfile = None

#st.cache()
def load_sheet1():
    sheet1 = pd.read_excel(excelfile, sheet_name="Sheet1")
    return sheet1

#st.cache()
def load_sheet2():
    sheet1 = pd.read_excel(excelfile, sheet_name="Sheet2")
    return sheet1

def show_table(data):
    if not data.empty:
        gd = GridOptionsBuilder.from_dataframe(data)
        gd.configure_pagination(enabled=True)
        gd.configure_default_column(editable=True, groupable=True)
        gd.configure_selection(selection_mode="multiple", use_checkbox=True)
        gridoptions = gd.build()
        grid_table = AgGrid(
            data,
            gridOptions=gridoptions,
            # Use MODEL_CHANGED instead of SELECTION_CHANGED
            update_mode=GridUpdateMode.MODEL_CHANGED,
            theme="material"
        )
        # Get the edited table when you make changes and return it
        edited_df = grid_table['data']
        return edited_df
    else:
        return pd.DataFrame()

excelfile = st.sidebar.file_uploader("Select Excel-File for cleansing", key="Raw_Data")
if st.session_state.excelfile != excelfile:
    st.session_state.excelfile = excelfile
    try:
        st.session_state.df = load_sheet1()
        st.session_state.df1 = load_sheet2()
    except:
        st.session_state.df = pd.DataFrame()
        st.session_state.df1 = pd.DataFrame()

tab1, tab2 = st.tabs(["Sheet 1", "Sheet 2"])
with tab1:
    # Get the edited DataFrame from the AgGrid object
    df = show_table(st.session_state.df)
with tab2:
    # Same thing here...
    df1 = show_table(st.session_state.df1)

# Then you need to click a button to apply the changes and
# reload the page before you go to the next tab
if st.button('Apply changes'):
    # Store the newly edited DataFrames in session state
    st.session_state.df = df
    st.session_state.df1 = df1
    # Rerun the page so the changes apply and the new DataFrames are rendered
    st.experimental_rerun()
After loading your file and making your changes in the first tab, hit the "Apply changes" button to reload the page before moving to the second tab.

Constructor __init__ is written to take two positional arguments, but when used it reports the error that only 1 is allowed

I have a class and constructor I'm working on. I added an extra parameter to __init__, and now I get the error TypeError: FeatureDataset() takes 1 positional argument but 2 were given.
I wonder why; it seems to me that it should accept two arguments. It's an incomplete implementation, but I'd like to get past this error about the number of constructor arguments. I have checked several answers, and they were either about something entirely different or about indentation, and I have neither of those issues (4 spaces per indentation level).
def FeatureDataset(Dataset):
    def __init__(self, root_dir, file_name):
        # load csv
        self.file_out = pd.read_csv(file_name)
        self.root_dir = root_dir
        self.labels = self.file_out.iloc[1:160, 0].values
        self.features = self.file_out.iloc[1:160, 1:].values

        # Feature Scaling
        sc = StandardScaler()
        label_train = self.labels
        feature_train = self.features  # sc.fit_transform(features)

        # Convert to torch tensors
        self.feature_train = torch.tensor(label_train, dtype=torch.float32)
        self.label_train = torch.tensor(label_train)

file_name = "data.csv"
root_dir = "archive"
feature_set = FeatureDataset(root_dir, file_name)
This defines a function, not a class:
def FeatureDataset(Dataset):
Try this instead:
class FeatureDataset(Dataset):
With def, FeatureDataset is a plain function with a single parameter (Dataset), so calling FeatureDataset(root_dir, file_name) passes two arguments to something that accepts only one, which is exactly what the TypeError reports.
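For reference, a minimal sketch of the corrected class (assuming PyTorch's torch.utils.data.Dataset, as in the question; the body is abbreviated):

import pandas as pd
import torch
from torch.utils.data import Dataset

class FeatureDataset(Dataset):  # class, so __init__ acts as the constructor
    def __init__(self, root_dir, file_name):
        self.root_dir = root_dir
        self.file_out = pd.read_csv(file_name)
        self.labels = self.file_out.iloc[1:160, 0].values
        self.features = self.file_out.iloc[1:160, 1:].values

# Two positional arguments are now accepted:
feature_set = FeatureDataset("archive", "data.csv")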

Dataiku: Job failed: Error in Python process: At line 158: <class 'NameError'>

I work in Dataiku and I have a Jupyter notebook that works; now I want to turn it into a Python recipe. The objective is to write a pandas DataFrame into a dataset.
data_df is the name of my DataFrame and output_gen_python is the name of my dataset in Dataiku.
I get this error:
Job failed: Error in Python process: At line 158: <class 'NameError'>: name 'data_df' is not defined
Here is my code:
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from datetime import datetime, timedelta

# Read recipe inputs
batches_types_copy = dataiku.Dataset("batches_types_copy")
batches_types_copy_df = batches_types_copy.get_dataframe()
Last_hour_extract = dataiku.Dataset("Last_hour_extract")
last_hour_extract_df = Last_hour_extract.get_dataframe()

class OutputMode(object):
    ...

class IDCalculation_I:
    def _preGenerateID(self, outputMode, data_df):
        ...

    def generateID(self, outputMode, data_df):
        pass

class IDCase1(IDCalculation_I):
    def generateID(self, outputMode, data_df):
        ...
        return data_df

class IDCase2(IDCalculation_I):
    def generateID(self, outputMode, data_df):
        ...
        return data_df

class Fingerprinter(object):
    def __init__(self, outputMode):
        self._outputMode = outputMode

    def _generateID(self, data_df):
        return self._outputMode.getCaseID().generateID(self._outputMode, data_df)

    def run(self, data_df):
        # GenerateID
        data_df = self._generateID(data_df)
        return data_df

    def __str__(self):
        return str(self._outputMode)

outputMode = OutputMode('EEA', '06:00:00', '08:00:00', pytz.timezone('Europe/Paris'), CONST_MODE_CONT, IDCase1())
fp_calculator = Fingerprinter(outputMode)
output_gen_python_df = data_df  # Compute a Pandas dataframe to write into output_gen_python

# Write recipe outputs
output_gen_python = dataiku.Dataset("output_gen_python")
output_gen_python.write_with_schema(output_gen_python_df)
The error says it all: at the line output_gen_python_df = data_df you are reading data_df, but data_df is never defined at that point (it only exists as a parameter inside your classes).
How to fix it?
Since you have already created fp_calculator from your Fingerprinter class, obtain the DataFrame by calling its run() method. Note that run(self, data_df) expects the input frame as an argument, so pass in the DataFrame you want to fingerprint, e.g.:
data_df = fp_calculator.run(last_hour_extract_df)
which will return your processed DataFrame.
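Put together, the tail of the recipe might read as follows (a sketch; last_hour_extract_df is an assumption here, substitute whichever input frame you actually want to process):

# Assumption: last_hour_extract_df is the frame to fingerprint
data_df = fp_calculator.run(last_hour_extract_df)
output_gen_python_df = data_df

# Write recipe outputs
output_gen_python = dataiku.Dataset("output_gen_python")
output_gen_python.write_with_schema(output_gen_python_df)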

How do I import various DataFrames from a different Python file?

I have a Python file called 'clean_data.py' which has all the DataFrames I need, and I want to import them into another Python file called 'main.py' for use in creating a dashboard.
Is it possible to create a class in my clean_data.py, and if so, can someone direct me to an article (which I have struggled to find so far) so that I can figure it out?
The aim is to shift from CSV to an API over time, so I wanted to keep the data-wrangling side of things in a different file while the web app components live in the main.py file.
Any help would be much appreciated.
The code from clean_data.py is:
import pandas as pd
import csv
import os # To access my file directory
print(os.getcwd()) # Let's me know the Current Work Directory
fdi_data = pd.read_csv(r'Data/fdi_data.csv')
fdi_meta = pd.read_csv(r'Data/fdi_metadata.csv')
debt_data = pd.read_csv(r'Data/debt_data.csv')
debt_meta = pd.read_csv(r'Data/debt_metadata.csv')
gdp_percap_data = pd.read_csv(r'Data/gdp_percap_data.csv', header=2)
gdp_percap_meta = pd.read_csv(r'Data/gdp_percap_metadata.csv')
gov_exp_data = pd.read_csv(r'Data/gov_exp_data.csv', header=2)
gov_exp_meta = pd.read_csv(r'Data/gov_exp_metadata.csv')
pop_data = pd.read_csv(r'Data/pop_data.csv', header=2)
pop_meta = pd.read_csv(r'Data/pop_metadata.csv')
"""
'wb' stands for World Bank
"""
def wb_merge_data(data, metadata):
    merge = pd.merge(
        data,
        metadata,
        on='Country Code',
        how='inner'
    )
    return merge
fdi_merge = wb_merge_data(fdi_data, fdi_meta)
debt_merge = wb_merge_data(debt_data, debt_meta)
gdp_percap_merge = wb_merge_data(gdp_percap_data, gdp_percap_meta)
gov_exp_merge = wb_merge_data(gov_exp_data, gov_exp_meta)
pop_merge = wb_merge_data(pop_data, pop_meta)
def wb_drop_data(data):
    drop = data.drop(['Country Code','Indicator Name','Indicator Code','TableName','SpecialNotes','Unnamed: 5'], axis=1)
    return drop
fdi_merge = wb_drop_data(fdi_merge)
debt_merge = wb_drop_data(debt_merge)
gdp_percap_merge = wb_drop_data(gdp_percap_merge)
gov_exp_merge = wb_drop_data(gov_exp_merge)
pop_merge = wb_drop_data(pop_merge)
def wb_mr_data(data, value_name):
    data = data.melt(['Country Name','Region','IncomeGroup']).reset_index()
    data = data.rename(columns={'variable': 'Year', 'value': value_name})
    data = data.drop('index', axis=1)
    return data
fdi_merge = wb_mr_data(fdi_merge, 'FDI')
debt_merge = wb_mr_data(debt_merge, 'Debt')
gdp_percap_merge = wb_mr_data(gdp_percap_merge, 'GDP per Cap')
gov_exp_merge = wb_mr_data(gov_exp_merge, 'Gov Expend.')
pop_merge = wb_mr_data(pop_merge, 'Population')
def avg_groupby(data, col_cal, cn=False, ig=False, rg=False):
    if cn == True:
        return data.groupby('Country Name')[col_cal].mean().reset_index()
    elif ig == True:
        return data.groupby('IncomeGroup')[col_cal].mean().reset_index()
    elif rg == True:
        return data.groupby('Region')[col_cal].mean().reset_index()
"""
avg_cn_... For country
avg_ig_... Income Group
avg_rg_... Region
"""
avg_cn_fdi = avg_groupby(fdi_merge, 'FDI', cn=True)
avg_ig_fdi = avg_groupby(fdi_merge, 'FDI', ig=True)
avg_rg_fdi = avg_groupby(fdi_merge, 'FDI', rg=True)
avg_cn_debt = avg_groupby(debt_merge, 'Debt', cn=True)
avg_ig_debt = avg_groupby(debt_merge, 'Debt', ig=True)
avg_rg_debt = avg_groupby(debt_merge, 'Debt', rg=True)
avg_cn_gdp_percap = avg_groupby(gdp_percap_merge, 'GDP per Cap', cn=True)
avg_ig_gdp_percap = avg_groupby(gdp_percap_merge, 'GDP per Cap', ig=True)
avg_rg_gdp_percap = avg_groupby(gdp_percap_merge, 'GDP per Cap', rg=True)
avg_cn_gexp = avg_groupby(gov_exp_merge, 'Gov Expend.', cn=True)
avg_ig_gexp = avg_groupby(gov_exp_merge, 'Gov Expend.', ig=True)
avg_rg_gexp = avg_groupby(gov_exp_merge, 'Gov Expend.', rg=True)
avg_cn_pop = avg_groupby(pop_merge, 'Population', cn=True)
avg_ig_pop = avg_groupby(pop_merge, 'Population', ig=True)
avg_rg_pop = avg_groupby(pop_merge, 'Population', rg=True)
In Python, every file is a module. So if you want to re-use your code, you can simply import this module. For example:
# main.py
import clean_data
print(clean_data.avg_cn_fdi)
You probably don't need to create a class for this.
You can import the whole Python file like you'd import any other locally created file and have access to the DataFrames in it. Here's an example:
I created a file called temporary.py:
import pandas as pd
data = pd.read_csv("temp.csv")
And then in a separate file I was able to use data like so:
import temporary
print(temporary.data)
Or, you could also do:
from temporary import data
print(data)
All that being said, I don't believe that this would be the best way to handle your data.
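For instance, one common alternative (a sketch, with a hypothetical load_data() helper) is to wrap the reading in a function, so importing the module has no side effects and the data source can later be swapped for an API call:

# temporary.py
import pandas as pd

def load_data(path="temp.csv"):
    # The CSV is read only when the caller asks for it,
    # not as a side effect of `import temporary`
    return pd.read_csv(path)

# main.py
import temporary

data = temporary.load_data()
print(data)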

PandasError: DataFrame constructor not properly called! for DataFrame reader class

So I have been writing this class that ingests two CSV files and writes them to the same output path. Every time I try to run the class I get:
PandasError: DataFrame constructor not properly called!
I want to know why this is happening and how to fix it.
Here is my class:
import pandas as pd

class twoCSVCombiner:
    def __init__(self, fileOne, fileTwo, outPath):
        self.fileOne = fileOne
        self.fileTwo = fileTwo
        self.outPath = outPath

    def reader(self, fileOne, fileTwo):
        fileOneDataframe = pd.DataFrame(fileOne)
        fileTwoDataFrame = pd.DataFrame(fileTwo)

    def writer(self, outPath):
        self.outPath = open(self.outPath, 'wb')
        fileOneOut = fileOneDataframe.to_csv(self.outPath, sep=',', header=True)
        fileTwoOut = fileTwoDataFrame.to_csv(self.outPath, sep=',', header=True)
Have you tried using pd.DataFrame.from_csv() instead of just pd.DataFrame()? The DataFrame constructor expects data (a dict, list, ndarray, or another DataFrame), not a file path, which is why it complains. (Note that pd.DataFrame.from_csv() has since been removed from pandas; pd.read_csv() is the current equivalent.)
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.from_csv.html
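A minimal sketch of how reader() and writer() could be restructured (an assumption: fileOne and fileTwo are CSV paths; the frames are stored on self so writer() can reach them):

# inside twoCSVCombiner:
def reader(self):
    # Parse the CSV paths saved by __init__ into DataFrames
    self.fileOneDataframe = pd.read_csv(self.fileOne)
    self.fileTwoDataframe = pd.read_csv(self.fileTwo)

def writer(self):
    # Write both frames into the same output file (text mode, not 'wb')
    with open(self.outPath, 'w') as out:
        self.fileOneDataframe.to_csv(out, sep=',', header=True)
        self.fileTwoDataframe.to_csv(out, sep=',', header=True)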
