Jupyter Dropdown ipywidget displays wrong dataframe - python

I am well and truly baffled. First time using the dropdown widget so forgive me if this is obvious and thank you for any help you can provide.
Here is the dataframe I want to display and how it was built:
def top_10_venues(data) :
num_top_venues = 10
indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
try:
columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
except:
columns.append('{}th Most Common Venue'.format(ind+1))
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = data['Neighborhood']
for ind in np.arange(denver_grouped.shape[0]):
neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(data.iloc[ind, :], num_top_venues)
neighborhoods_venues_sorted = neighborhoods_venues_sorted.set_index(['Neighborhood'])
top_10_venues(denver_grouped)
neighborhoods_venues_sorted
Here is my dropdown widget:
#Experimenting with Jupyter dropdown
filtered_df = None
dropdown = widgets.SelectMultiple(
options=neighborhoods_venues_sorted.index,
description='Venue',
disabled=False,
layout={'height':'100px', 'width':'40%'})
def max_density(widget):
global filtered_df
selection = list(widget['new'])
with out:
clear_output()
display(neighborhoods_venues_sorted[selection])
filtered_df = neighborhoods_venues_sorted[selection]
out = widgets.Output()
dropdown.observe(filter_dataframe, names='value')
display(dropdown)
display(out)
Here is what I end up seeing, the unformatted dataframe I ran the function on?

Booyah, figured it out!
Seems my issue was a misunderstanding of what was happening within the cell that created neighborhoods_venues_sorted. I thought I was creating a dataframe. Instead I created a function
First is the sort function
def return_most_common_venues(row, num_top_venues):
row_categories = row.iloc[1:]
row_categories_sorted = row_categories.sort_values(ascending=False)
return row_categories_sorted.index.values[0:num_top_venues]
This is the new function instead of a block of code in a cell
#Function to create sorted data frame with top 10 most common venues
def top_ten_venues(df) :
num_top_venues = 10
indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
try:
columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
except:
columns.append('{}th Most Common Venue'.format(ind+1))
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = df['Neighborhood']
for ind in np.arange(denver_grouped.shape[0]):
neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(df.iloc[ind, :], num_top_venues)
#important to have a return in a function, this is the output that can be attached to a variable
return neighborhoods_venues_sorted
Next I ran it on my targeted dataframe and assigned it to a variable. This fixed my issue, I'm still too new to understand fully why when this exact same code was run in a cell it refused to assign it as a new dataframe.
#creating a variable to hold the df for later access
neighborhoods_venues_sorted = top_ten_venues(denver_grouped)
#reindexing because it's fun
neighborhoods_venues_sorted = neighborhoods_venues_sorted.set_index(['Neighborhood'])

Related

Save values in a dataframe and replace them in a csv

so im not quite sure how to formulate the question, as im quite new in pythong and coding in general.
I have a GUI that displays already available information form a csv:
def updatetext(self):
"""adds information extracted from database already provided"""
df_subj = Content.extract_saved_data(self.date)
self.lineEditFirstDiagnosed.setText(str(df_subj["First_Diagnosed_preop"][0])) \
if str(df_subj["First_Diagnosed_preop"][0]) != 'nan' else self.lineEditFirstDiagnosed.setText('')
self.lineEditAdmNeurIndCheck.setText(str(df_subj['Admission_preop'][0])) \
works great
now, if i chenge values in the GUI, i want them to be updated in the csv.
I started like this:
def onClickedSaveReturn(self):
"""closes GUI and returns to calling (main) GUI"""
df_general = Clean.get_GeneralData()
df_subj = {k: '' for k in Content.extract_saved_data(self.date).keys()} # extract empty dictionary
df_subj['ID'] = General.read_current_subj().id[0]
df_subj['PID'] = df_general['PID_ORBIS'][0]
df_subj['Gender'] = df_general['Gender'][0]
df_subj['Diagnosis_preop'] = df_general['diagnosis'][0]
df_subj["First_Diagnosed_preop"] = self.lineEditFirstDiagnosed.text()
df_subj['Admission_preop'] = self.lineEditAdmNeurIndCheck.text()
df_subj['Dismissal_preop'] = self.DismNeurIndCheckLabel.text()
and this is what my boss added now:
subj_id = General.read_current_subj().id[0] # reads data from curent_subj (saved in ./tmp)
df = General.import_dataframe('{}.csv'.format(self.date), separator_csv=',')
if df.shape[1] == 1:
df = General.import_dataframe('{}.csv'.format(self.date), separator_csv=';')
idx2replace = df.index[df['ID'] == subj_id][0]
# TODO: you need to find a way to turn the dictionaryy df_subj into a dataframe and replace the data at
# the index idxreplace of 'df' with df_subj. Later I would suggest to use line 322 to save everything to the
# file
df.iloc[idx2replace] = pds.DataFrame([df_subj])
df.to_csv("preoperative.csv", index=False)
# df.to_csv(os.path.join(FILEDIR, "preoperative.csv"), index=False)
self.close()
I'm not really sure how to approach this, or to be honest, what to do at all.
Hope someone can help me.
Thank youu
You should load the file only once and keep the DF (self.df or something). Then display it and every time the user changes a value in the GUI the DF should update and when the user clicks save you should just overwrite the existing file with the current DF in memory.

How do I loop column names in a pandas dataframe?

I am new to Python and have never really used Pandas, so forgive me if this doesn't make sense. I am trying to create a df based on frontend data I am sending to a flask route. The data is looped through and appended for each row. My only problem is that I don't know how to get the df columns to reflect that. Here is my code to build the rows and the current output:
claims = csv_data["claims"]
setups = csv_data["setups"]
for setup in setups:
setup = setups[0]
offerings = setup["currentOfferings"]
considered = setup["considerationSet"]
reach_dict = setup["reach"]
favorite_dict = setup["favorite"]
summary_dict = setup["summaryMetrics"]
rows = []
for i, claim in enumerate(claims):
row = []
row.append(i + 1)
row.append(claim)
for setup in setups:
setup = setups[0]
row.append("X") if claim in setup["currentOfferings"] else row.append(float('nan'))
row.append("X") if claim in setup["considerationSet"] else row.append(float('nan'))
if claim in setup["currentOfferings"]:
reach_score = reach_dict[claim]
reach_percentage = "{:.0%}".format(reach_score)
row.append(reach_percentage)
else:
row.append(float('nan'))
if claim in setup["currentOfferings"]:
favorite_score = favorite_dict[claim]
fav_percentage = "{:.0%}".format(favorite_score)
row.append(fav_percentage)
else:
row.append(float('nan'))
rows.append(row)
I know that I can put columns = ["#", "Claims", "Setups", etc...] in the df, but that doesn't work because the rows are looping through multiple setups, and the number of setups can change. If I don't specify the column names (how it is in the image), then I just have numbers as columns names. Ideally it should loop through the data it receives in the route, and would start with "#" "Claims" as columns, and then for each setup "Setup 1", "Consideration Set 1", "Reach", "Favorite", "Setup 2", "Consideration Set 2", and so on... etc.
I tried to create a similar type of loop for the columns:
my_columns = []
for i, row in enumerate(rows):
col = []
if row[0] != None:
col.append("#")
else:
pass
if row[1] != None:
col.append("Claims")
else:
pass
if row[2] != None:
col.append("Setup")
else:
pass
if row[3] != None:
col.append("Consideration Set")
else:
pass
if row[4] != None:
col.append("Reach")
else:
pass
if row[5] != None:
col.append("Favorite")
else:
pass
my_columns.append(col)
df = pd.DataFrame(
rows,
columns = my_columns
)
But this didn't work because I have the same issue of no loop, I have 6 columns passed and 10 data columns passed. I'm not sure if I am just not doing the loop of the columns properly, or if I am making everything more complicated than it needs to be.
This is what I am trying to accomplish without having to explicitly name the columns because this is just sample data. There could end up being 3, 4, however many setups in the actual app.
what I would like the ouput to look like
I don't know if this is the most efficient way of doing something like this but I think this is what you want to achieve.
def create_columns(df):
new_cols=[]
for i in range(len(df.columns)):
repeated_cols = 6 #here is the number of columns you need to repeat for every setup
idx = 1 + i // repeated_cols
basic = ['#', 'Claims', f'Setup_{idx}', f'Consideration_Set_{idx}', 'Reach', 'Favorite']
new_cols.append(basic[i % len(basic)])
return new_cols
df.columns = create_columns(df)
If your data comes as csv then try pd.read_csv() to create dataframe.

How do you replace the index column in Bokeh DataTable?

I created a DataTable in Bokeh, but it doesn't show the index column:
I expected to have the "Index" column in left:
This is my code:
evolution_data = treatcriteria_daily_data_table.groupby(['startdate_dayweek','startdate_weekyear'],as_index = False).sum().pivot('startdate_dayweek','startdate_weekyear').fillna(0)
evolution_data = evolution_data.droplevel(0,1)
evolution_data.loc['Total'] = evolution_data.sum()
evolution_data['Index'] = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche', 'Total']
evolution_data.set_index('Index', inplace=True, drop=True)
# remove the last week if there is not all the data
evolution_data = evolution_data.loc[:, ~(evolution_data == 0.).any()]
evolution_two_last_weeks = []
nb_cols = len(evolution_data.columns)
diff_cpu_2_last_weeks = evolution_data.iat[7, nb_cols - 1] - evolution_data.iat[7, nb_cols - 2]
for i in range(0,7):
daily_evolution = (data_2_weeks_before_the_last.iat[1,i] - data_2_weeks_before_the_last.iat[0,i]) / diff_cpu_2_last_weeks
evolution_two_last_weeks.append(daily_evolution)
variation_of_total = (data_2_weeks_before_the_last.iat[1,7] - data_2_weeks_before_the_last.iat[0,7]) / data_2_weeks_before_the_last.iat[0,7]
daily_variations_dict = {"Lundi": evolution_two_last_weeks[0],
"Mardi": evolution_two_last_weeks[1],
"Mercredi": evolution_two_last_weeks[2],
"Jeudi": evolution_two_last_weeks[3],
"Vendredi": evolution_two_last_weeks[4],
"Samedi": evolution_two_last_weeks[5],
"Dimanche": evolution_two_last_weeks[6],
"Total": variation_of_total}
# remove decimals in the table
cols = evolution_data.columns
evolution_data[cols] = evolution_data[cols].applymap(np.int64)
evolution_data['% évolution des 2 dernières semaines'] = evolution_data.index.map(mapper=(lambda x: daily_variations_dict[x]))
evolution_data['% évolution des 2 dernières semaines'] = pd.Series(["{0:.2f}%".format(val*100) for val in evolution_data['% évolution des 2 dernières semaines']], index = evolution_data.index)
print(evolution_data)
Columns = [TableColumn(field=Ci, title=Ci) for Ci in evolution_data.columns] # bokeh columns
data_table = DataTable(columns=Columns, source=ColumnDataSource(evolution_data)) # bokeh table
show(data_table)
How to show the index column (with day names) and not the row number? Thank you.
As already said in comments Bokeh doesn't support altering the first column which is always fixed and indicates the 0-based row number. However, under the hood, Bokeh uses the SlickGrid which does allows this functionality, but this solution is too complicated as you would need to find the reference to the SlickGrid object first in the Bokeh model and then replace the first column in JavaScript right after the page load.
Much more simple way is as suggested to hide the first column using index_position = None so you could do data_table = DataTable(... , index_position = None) and then add the Index data from your DataFrame as first one to the table's columns. The reason that it is not there now is that the df.columns doesn't include the index column that you need. So try:
cols.insert(0, 'Index')
data_table = DataTable(columns=Columns,
source=ColumnDataSource(evolution_data),
index_position = None) # add this line

Rename a data frame name by adding the iteration value as suffix in a for loop (Python)

I have run the following Python code :
array = ['AEM000', 'AID017']
USA_DATA_1D = USA_DATA10.loc[USA_DATA10['JOBSPECIALTYCODE'].isin(array)]
I run a regression model and extract the log-likelyhood value on each item of this array by a for loop :
for item in array:
USA_DATA_1D = USA_DATA10.loc[USA_DATA10['JOBSPECIALTYCODE'] == item]
formula = "WEIGHTED_BASE_MEDIAN_FINAL_MEAN ~ YEAR"
response, predictors = dmatrices(formula, USA_DATA_1D, return_type='dataframe')
mod1 = sm.GLM(response, predictors, family=sm.genmod.families.family.Gaussian()).fit()
LLF_NG = {'model': ['Standard Gaussian'],
'llf_value': mod1.llf
}
df_llf = pd.DataFrame(LLF_NG , columns = ['model', 'llf_value'])
Now I would like to remane the dataframe df_llf by df_llf_(name of the item) i.e. df_llf_AEM000 when running the loop on the first item and df_llf_AID017 when running the loop on the second one.
I need some help to know how to proceed that.
If you want to rename the data frame, you need to use the copy method so that the original data frame does not get altered.
df_llf_AEM000 = df_llf.copy()
If you want to save iteratively several different versions of the original data frame, you can do something like this:
allDataframes = []
for i in range(10):
df = df_original.copy()
allDataframes.append(df)
print(allDataframes[0])

Script keep showing "SettingCopyWithWarning'

Hello my problem is that my script keep showing below message
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
downcast=downcast
I Searched the google for a while regarding this, and it seems like my code is somehow
assigning sliced dataframe to new variable, which is problematic.
The problem is ** I can't find where my code get problematic **
I tried copy function, or seperated the nested functions, but it is not working
I attached my code below.
def case_sorting(file_get, col_get, methods_get, operator_get, value_get):
ops = {">": gt, "<": lt}
col_get = str(col_get)
value_get = int(value_get)
if methods_get is "|x|":
new_file = file_get[ops[operator_get](file_get[col_get], value_get)]
else:
new_file = file_get[ops[operator_get](file_get[col_get], np.percentile(file_get[col_get], value_get))]
return new_file
Basically what i was about to do was to make flask api that gets excel file as an input, and returns the csv file with some filtering. So I defined some functions first.
def get_brandlist(df_input, brand_input):
if brand_input == "default":
final_list = (pd.unique(df_input["브랜드"])).tolist()
else:
final_list = brand_input.split("/")
if '브랜드' in final_list:
final_list.remove('브랜드')
final_list = [x for x in final_list if str(x) != 'nan']
return final_list
Then I defined the main function
def select_bestitem(df_data, brand_name, col_name, methods, operator, value):
# // 2-1 // to remove unnecessary rows and columns with na values
df_data = df_data.dropna(axis=0 & 1, how='all')
df_data.fillna(method='pad', inplace=True)
# // 2-2 // iterate over all rows to find which row contains brand value
default_number = 0
for row in df_data.itertuples():
if '브랜드' in row:
df_data.columns = df_data.iloc[default_number, :]
break
else:
default_number = default_number + 1
# // 2-3 // create the list contains all the target brand names
brand_list = get_brandlist(df_input=df_data, brand_input=brand_name)
# // 2-4 // subset the target brand into another dataframe
df_data_refined = df_data[df_data.iloc[:, 1].isin(brand_list)]
# // 2-5 // split the dataframe based on the "brand name", and apply the input condition
df_per_brand = {}
df_per_brand_modified = {}
for brand_each in brand_list:
df_per_brand[brand_each] = df_data_refined[df_data_refined['브랜드'] == brand_each]
file = df_per_brand[brand_each].copy()
df_per_brand_modified[brand_each] = case_sorting(file_get=file, col_get=col_name, methods_get=methods,
operator_get=operator, value_get=value)
# // 2-6 // merge all the remaining dataframe
df_merged = pd.DataFrame()
for brand_each in brand_list:
df_merged = df_merged.append(df_per_brand_modified[brand_each], ignore_index=True)
final_df = df_merged.to_csv(index=False, sep=',', encoding='utf-8')
return final_df
And I am gonna import this function in my app.py later
I am quite new to all the coding, therefore really really sorry if my code is quite hard to understand, but I just really wanted to get rid of this annoying warning message. Thanks for help in advance :)

Categories

Resources