Adding records to a dataframe using a function - python

I want to add new records to alldata by concatenating newdata to it. Somehow alldata gets wiped out after each cycle. Can you please help me fix this?
import time

import pandas as pd
import schedule

alldata = pd.DataFrame()

def myfunc(outside_file):
    newdata = pd.DataFrame({'id': ['A', 'c'],
                            'value': [20, 0]})
    print(newdata)
    outside_file = pd.concat([outside_file, newdata])
    print(outside_file)

schedule.every().minute.at(':00').do(myfunc, alldata)

while True:
    schedule.run_pending()
    print(alldata)
    time.sleep(30)

You're not modifying alldata: you're passing it as an argument to a function, and what's being modified is the argument inside that function, not the global variable. pd.concat returns a new DataFrame, so reassigning the parameter only rebinds the local name. Compare the following:
Snippet 1:
import pandas as pd

alldata = pd.DataFrame()

def run(outside_df):
    df = pd.DataFrame({'id': ['A', 'B'], 'value': [20, 0]})
    outside_df = pd.concat([outside_df, df])

for _ in range(5):
    run(alldata)

print(alldata)
# Empty DataFrame
# Columns: []
# Index: []
Snippet 2:
import pandas as pd

alldata = pd.DataFrame()

def run():
    global alldata
    df = pd.DataFrame({'id': ['A', 'B'], 'value': [20, 0]})
    alldata = pd.concat([alldata, df])

for _ in range(5):
    run()

print(alldata)
#   id  value
# 0  A     20
# 1  B      0
# 0  A     20
# 1  B      0
# 0  A     20
# 1  B      0
# 0  A     20
# 1  B      0
# 0  A     20
# 1  B      0
Moral of the story: use global when you want to modify a global variable inside a function (or, better, don't mutate globals at all and return the result instead).
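A minimal sketch of that return-and-reassign pattern (note that, as far as I know, the schedule library ignores a job's return value, so the original scheduled code would still need the global approach or a mutable container):
import pandas as pd

alldata = pd.DataFrame()

def run(df):
    new = pd.DataFrame({'id': ['A', 'B'], 'value': [20, 0]})
    return pd.concat([df, new], ignore_index=True)  # return a new object instead of mutating

for _ in range(5):
    alldata = run(alldata)  # reassign the result each cycle

print(alldata)  # 10 rows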


How to transpose values from top few rows in python dataframe into new columns

I am trying to select the values from the top 3 records of each group in a sorted pandas dataframe and put them into new columns. I have a function that processes each group, but I am having difficulty finding the right method to extract the values, rename the resulting series, and combine the result into a single series to return.
Below is a simplified example of an input dataframe (df_in) and the expected output (df_out):
import pandas as pd

data_in = {'Product': ['A', 'A', 'A', 'A', 'B', 'C', 'C'],
           'Price': [25.0, 30.5, 50.0, 61.5, 120.0, 650.0, 680.0],
           'Qty': [15, 13, 14, 10, 5, 2, 1]}
df_in = pd.DataFrame(data_in, columns=['Product', 'Price', 'Qty'])
I am reproducing below two examples of the functions I've tested, and I am trying to find a more efficient option that works, especially if I have to process many more columns and records.
Function best3_prices_v1 works, but I have to explicitly specify each column or variable, which becomes a real issue as I add more columns.
def best3_prices_v1(x):
    d = {}
    # get best 3 records if records available, else set volumes as zeroes
    best_price_lv1 = x.iloc[0].copy()
    rec_with_zeroes = best_price_lv1.copy()
    rec_with_zeroes['Price'] = 0
    rec_with_zeroes['Qty'] = 0
    recs = len(x)  # number of records
    if recs == 1:
        # 2nd and 3rd records not available
        best_price_lv2 = rec_with_zeroes.copy()
        best_price_lv3 = rec_with_zeroes.copy()
    elif recs == 2:
        best_price_lv2 = x.iloc[1]
        # 3rd record not available
        best_price_lv3 = rec_with_zeroes.copy()
    else:
        best_price_lv2 = x.iloc[1]
        best_price_lv3 = x.iloc[2]
    # 1st best
    d['Price_1'] = best_price_lv1['Price']
    d['Qty_1'] = best_price_lv1['Qty']
    # 2nd best
    d['Price_2'] = best_price_lv2['Price']
    d['Qty_2'] = best_price_lv2['Qty']
    # 3rd best
    d['Price_3'] = best_price_lv3['Price']
    d['Qty_3'] = best_price_lv3['Qty']
    # return combined results as a series
    return pd.Series(d, index=['Price_1', 'Qty_1', 'Price_2', 'Qty_2', 'Price_3', 'Qty_3'])
Code to call the function:
# sort dataframe by Product and Price
df_in.sort_values(by=['Product', 'Price'], ascending=True, inplace=True)
# get best 3 prices and qty as new columns
df_out = df_in.groupby(['Product']).apply(best3_prices_v1).reset_index()
My second attempt below tries to reduce the code and avoid an explicit name for each variable, but it is not complete and not working.
def best3_prices_v2(x):
    d = {}
    # get best 3 records if records available, else set volumes as zeroes
    best_price_lv1 = x.iloc[0].copy()
    rec_with_zeroes = best_price_lv1.copy()
    rec_with_zeroes['Price'] = 0
    rec_with_zeroes['Qty'] = 0
    recs = len(x)  # number of records
    if recs == 1:
        # 2nd and 3rd records not available
        best_price_lv2 = rec_with_zeroes.copy()
        best_price_lv3 = rec_with_zeroes.copy()
    elif recs == 2:
        best_price_lv2 = x.iloc[1]
        # 3rd record not available
        best_price_lv3 = rec_with_zeroes.copy()
    else:
        best_price_lv2 = x.iloc[1]
        best_price_lv3 = x.iloc[2]
    stats_columns = ['Price', 'Qty']
    # get records values for best 3 prices
    d_lv1 = best_price_lv1[stats_columns]
    d_lv2 = best_price_lv2[stats_columns]
    d_lv3 = best_price_lv3[stats_columns]
    # How to rename (keys?) or combine values to return?
    lv1_stats_columns = [c + '_1' for c in stats_columns]
    lv2_stats_columns = [c + '_2' for c in stats_columns]
    lv3_stats_columns = [c + '_3' for c in stats_columns]
    # return combined results as a series
    return pd.Series(d, index=lv1_stats_columns + lv2_stats_columns + lv3_stats_columns)
Let's unstack():
df_in = (df_in.set_index([df_in.groupby('Product').cumcount().add(1), 'Product'])
              .unstack(0, fill_value=0))
df_in.columns = [f"{x}_{y}" for x, y in df_in.columns]
df_in = df_in.reset_index()
OR via pivot()
df_in = (df_in.assign(key=df_in.groupby('Product').cumcount().add(1))
              .pivot('Product', 'key', ['Price', 'Qty'])
              .fillna(0, downcast='infer'))
df_in.columns = [f"{x}_{y}" for x, y in df_in.columns]
df_in = df_in.reset_index()
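Note: in recent pandas versions, DataFrame.pivot no longer accepts positional arguments, so the pivot call above (and the pivot(group_list, 'key', stats_list) call further below) would need keywords; a sketch of the equivalent call:
df_in = (df_in.assign(key=df_in.groupby('Product').cumcount().add(1))
              .pivot(index='Product', columns='key', values=['Price', 'Qty'])
              .fillna(0, downcast='infer'))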
Based on @AnuragDabas's pivot solution and @ceruler's feedback above, I can now expand it to a more general problem.
New dataframe with more groups and columns:
data_in = {'Product': ['A', 'A', 'A', 'A', 'B', 'C', 'C'],
           'Model': ['A1', 'A1', 'A1', 'A2', 'B1', 'C1', 'C1'],
           'Price': [25.0, 30.5, 50.0, 61.5, 120.0, 650.0, 680.0],
           'Qty': [15, 13, 14, 10, 5, 2, 1],
           'Ratings': [9, 7, 8, 10, 6, 7, 8]}
df_in = pd.DataFrame(data_in, columns=['Product', 'Model', 'Price', 'Qty', 'Ratings'])

group_list = ['Product', 'Model']
stats_list = ['Price', 'Qty', 'Ratings']

df_out = df_in.groupby(group_list).head(3)
df_out = (df_out.assign(key=df_out.groupby(group_list).cumcount().add(1))
                .pivot(group_list, 'key', stats_list)
                .fillna(0, downcast='infer'))
df_out.columns = [f"{x}_{y}" for x, y in df_out.columns]
df_out = df_out.reset_index()

User input to create a column in Pandas DataFrame

I have a pandas DataFrame:
import pandas as pd

sample_data = {'Sample': ['A', 'B', 'A', 'B'],
               'Surface': ['Top', 'Bottom', 'Top', 'Bottom'],
               'Intensity': [21, 32, 14, 45]}
sample_dataframe = pd.DataFrame(data=sample_data)
And I have a function that gets user input to create a 'Condition' column for each 'Sample':
def get_choice(df, column):
    #df['Condition'] = user_input
    user_input = []
    for i in df[column]:
        print('\n', i)
        user_input.append(input('Condition= '))
    df['Condition'] = user_input
    return df

get_choice(sample_dataframe, 'Sample')
This works; however, the user is prompted for every row in which a 'Sample' exists. That is not a problem in this example, where the samples have two rows each, but when the DataFrame is larger and samples occupy many rows, it gets tedious.
How do I create a function that fills the 'Condition' column for every row a 'Sample' occupies while asking for the input only once?
I tried creating the function to return a dictionary and then .apply() it to the DataFrame, but when I do that it still asks for input for each instance of the 'Sample'.
If I understand your question right, you want to get user input only once for each unique value and then create column 'Condition':
import pandas as pd

sample_data = {'Sample': ['A', 'B', 'A', 'B'],
               'Surface': ['Top', 'Bottom', 'Top', 'Bottom'],
               'Intensity': [21, 32, 14, 45]}
sample_dataframe = pd.DataFrame(data=sample_data)

def get_choice(df, column):
    m = {}
    for v in df[column].unique():
        m[v] = input('Condition for [{}] = '.format(v))
    df['Condition'] = df[column].map(m)
    return df

print(get_choice(sample_dataframe, 'Sample'))
Prints (for example)
Condition for [A] = 1
Condition for [B] = 2
  Sample Surface  Intensity Condition
0      A     Top         21         1
1      B  Bottom         32         2
2      A     Top         14         1
3      B  Bottom         45         2
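The key step is Series.map, which broadcasts one value per unique sample across all matching rows. If the conditions are already known up front (no prompting needed), the same idea applies directly; a small sketch with a hypothetical mapping:
conditions = {'A': 'treated', 'B': 'control'}  # hypothetical mapping, not from the question
sample_dataframe['Condition'] = sample_dataframe['Sample'].map(conditions)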

Google cloud NL API data to Pandas Dataframe

I'm using the Google NL API (sample_classify_text).
It's sending me data that I transformed into this format:
response_list = [[['a', 'b', 'c'], [1,2,3], ['url1']], [['d'], [4], ['url2']]]
From here I'd like to build a Pandas df that looks like this:
a  b  c  1  2  3  url1
d        4        url2
Note that the number of results for each url is different (a, b, c = 3 results; d = 1 result). It seems that most of the time the number of results is < 4, but I'm not sure about this, so I'd like to keep it flexible.
I've tried a few things, but it gets pretty complicated. I'm wondering if there's an easy way to handle that?
Have you tried creating a Pandas DataFrame directly from the list?
Like this:
import pandas as pd
response_list = [[['a', 'b', 'c'], [1,2,3], ['url1']], [['d'], [4], ['url2']]]
df = pd.DataFrame(response_list)
The result of print(df) is:
           0          1       2
0  [a, b, c]  [1, 2, 3]  [url1]
1        [d]        [4]  [url2]
That's what I ended up doing.
Not the most elegant solution...
Please don't tell me this can be done with a one-liner :D
import pandas as pd

response_list = [[['a', 'b', 'c'], [1, 2, 3], ['url1']], [['d'], [4], ['url2']]]

# to create the columns
colum_0, colum_1, colum_2, colum_3, colum_4, colum_5, colum_6 = [None], [None], [None], [None], [None], [None], [None]

for main_list in response_list:
    for idx_macro, sub_list in enumerate(main_list):
        for idx, elem in enumerate(sub_list):
            if idx_macro == 0:
                if idx == 0:
                    colum_0.append(elem)
                if idx == 1:
                    colum_1.append(elem)
                if idx == 2:
                    colum_2.append(elem)
            elif idx_macro == 1:
                if idx == 0:
                    colum_3.append(elem)
                if idx == 1:
                    colum_4.append(elem)
                if idx == 2:
                    colum_5.append(elem)
            elif idx_macro == 2:
                colum_6.append(elem)

colum_lists = [colum_0, colum_1, colum_2, colum_3, colum_4, colum_5, colum_6]
longest_list = 3
colum_lists2 = []
for lst in colum_lists[:-1]:  # skip urls
    while len(lst) < longest_list:
        lst.append(None)
    colum_lists2.append(lst)
colum_lists2.append(colum_6)  # add urls

df = pd.DataFrame(colum_lists2)
df = df.transpose()
df = df.drop(0)
display(df)
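For comparison, a shorter padding-based sketch of the same reshaping; it assumes each entry is [labels, scores, [url]] as above and pads labels and scores to the widest row:
import pandas as pd

response_list = [[['a', 'b', 'c'], [1, 2, 3], ['url1']], [['d'], [4], ['url2']]]

# widest labels/scores sublist determines how many columns each part gets
width = max(max(len(labels), len(scores)) for labels, scores, url in response_list)
rows = [list(labels) + [None] * (width - len(labels))
        + list(scores) + [None] * (width - len(scores))
        + list(url)
        for labels, scores, url in response_list]
df = pd.DataFrame(rows)
print(df)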

Joining dataframe in for loop

My code won't work... it gives me ValueError: columns overlap but no suffix specified
import pandas as pd
import pickle

list = ["ZILLOW2.csv", "ZILLOW3.csv", "ZILLOW4.csv",
        "ZILLOW6.csv", "ZILLOW7.csv", "ZILLOW8.csv"]
maindf = pd.DataFrame()
for x in list:
    df = pd.read_csv(x)
    if x == "ZILLOW2.csv":
        maindf = pd.DataFrame(df)
    else:
        maindf = maindf.join(df)
print(maindf)
Use concat:
import pandas as pd

files = ["ZILLOW2.csv", "ZILLOW3.csv", "ZILLOW4.csv",
         "ZILLOW6.csv", "ZILLOW7.csv", "ZILLOW8.csv"]  # renamed to avoid shadowing the built-in list
maindf = pd.DataFrame()
for x in files:
    df = pd.read_csv(x)
    if x == "ZILLOW2.csv":
        maindf = pd.DataFrame(df)
    else:
        maindf = pd.concat([maindf, df], axis=1)
print(maindf)
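If the files all line up on the same row order, a common idiom is to read them into a list and concatenate once, which avoids the special case for the first file; a sketch assuming that alignment:
import pandas as pd

files = ["ZILLOW2.csv", "ZILLOW3.csv", "ZILLOW4.csv",
         "ZILLOW6.csv", "ZILLOW7.csv", "ZILLOW8.csv"]
# read every file, then concatenate side by side in one call
maindf = pd.concat([pd.read_csv(f) for f in files], axis=1)
print(maindf)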

Pandas integrate over columns per each row

In a simplified dataframe:
import pandas as pd

df1 = pd.DataFrame({'350': [7.898167, 6.912074, 6.049002, 5.000357, 4.072320],
                    '351': [8.094912, 7.090584, 6.221289, 5.154516, 4.211746],
                    '352': [8.291657, 7.269095, 6.393576, 5.308674, 4.351173],
                    '353': [8.421007, 7.374317, 6.496641, 5.403691, 4.439815],
                    '354': [8.535562, 7.463452, 6.584512, 5.485725, 4.517310],
                    '355': [8.650118, 7.552586, 6.672383, 4.517310, 4.594806]},
                   index=[1, 2, 3, 4, 5])

int_range = df1.columns.astype(float)
a = 0.005
b = 0.837
I would like to solve the following equation:
INT = (1 / (a · b)) · ∫ I(x) · x dx
I is equal to the values in the dataframe. x is the int_range values, so in this case from 350 to 355, with dx = 1. a and b are constants.
I need a dataframe with one integrated value per row as output.
For now I do something like this, but I'm not sure it's correct:
from scipy import integrate

dict_INT = {}
for index, row in df1.iterrows():
    func = df1.loc[index] * df1.loc[index].index.astype('float')
    x = df1.loc[index].index.astype('float')
    dict_INT[index] = integrate.trapz(func, x)

df_out = pd.DataFrame(dict_INT, index=['INT']).T
df_fin = df_out / (a * b)
This is the final sum I get per row:
1 3.505796e+06
2 3.068796e+06
3 2.700446e+06
4 2.199336e+06
5 1.840992e+06
I solved this by first converting the dataframe to a dict, then applying your equation to each item in each row, and writing those values to a dict of lists using collections.defaultdict. I will break it down:
import pandas as pd
from collections import defaultdict

df1 = pd.DataFrame({'350': [7.898167, 6.912074, 6.049002, 5.000357, 4.072320],
                    '351': [8.094912, 7.090584, 6.221289, 5.154516, 4.211746],
                    '352': [8.291657, 7.269095, 6.393576, 5.308674, 4.351173],
                    '353': [8.421007, 7.374317, 6.496641, 5.403691, 4.439815],
                    '354': [8.535562, 7.463452, 6.584512, 5.485725, 4.517310],
                    '355': [8.650118, 7.552586, 6.672383, 4.517310, 4.594806]},
                   index=[1, 2, 3, 4, 5])

int_range = df1.columns.astype(float)
a = 0.005
b = 0.837
dx = 1

df_dict = df1.to_dict()  # convert df to dict for easier operations
integrated_dict = {}  # initialize empty dict
d = defaultdict(list)  # initialize empty dict of lists for tuples later
integrated_list = []
for k, v in df_dict.items():  # unpack df dict of dicts
    for x, y in v.items():  # unpack dicts by column and index (x is index, y is column value)
        integrated_list.append((k, (float(k) * float(y) * float(dx)) / (a * b)))  # store a list of tuples
for x, y in integrated_list:  # create dict with column header as key and new integrated calc as value
    d[x].append(y)
d = {k: tuple(v) for k, v in d.items()}  # unpack to multiple values
integrated_df = pd.DataFrame.from_dict(d)  # to df
integrated_df['Sum'] = integrated_df.iloc[:, :].sum(axis=1)
output (updated to include sum):
350 351 352 353 354 \
0 660539.653524 678928.103226 697410.576822 710302.382557 722004.527599
1 578070.704898 594694.141935 611402.972521 622015.269056 631317.086738
2 505890.250896 521785.529032 537763.142652 547984.294624 556969.473835
3 418189.952210 432314.245161 446512.126165 455795.202628 464025.483871
4 340576.344086 353243.212903 365976.797133 374493.356033 382109.376344
355 Sum
0 733761.502987 4.202947e+06
1 640661.416965 3.678162e+06
2 565996.646356 3.236389e+06
3 383188.781362 2.600026e+06
4 389762.516129 2.206162e+06
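For reference, the per-row loop in the question can also be written as one vectorized trapezoidal integration over the columns; a minimal sketch using numpy.trapz with the question's df1, a, and b:
import numpy as np
import pandas as pd

df1 = pd.DataFrame({'350': [7.898167, 6.912074, 6.049002, 5.000357, 4.072320],
                    '351': [8.094912, 7.090584, 6.221289, 5.154516, 4.211746],
                    '352': [8.291657, 7.269095, 6.393576, 5.308674, 4.351173],
                    '353': [8.421007, 7.374317, 6.496641, 5.403691, 4.439815],
                    '354': [8.535562, 7.463452, 6.584512, 5.485725, 4.517310],
                    '355': [8.650118, 7.552586, 6.672383, 4.517310, 4.594806]},
                   index=[1, 2, 3, 4, 5])
a, b = 0.005, 0.837

x = df1.columns.astype(float).to_numpy()
integrals = np.trapz(df1.to_numpy() * x, x, axis=1)  # trapezoidal rule per row on I(x)*x
df_fin = pd.Series(integrals, index=df1.index, name='INT') / (a * b)
print(df_fin)  # should match the per-row sums from the question's loop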
