Create pandas DataFrames in a function - python

How can I build a function that creates these dataframes?

buy_orders_1h = pd.DataFrame(
    {'Date_buy': buy_orders_date_1h,
     'Name_buy': buy_orders_name_1h
     })
sell_orders_1h = pd.DataFrame(
    {'Date_sell': sell_orders_date_1h,
     'Name_sell': sell_orders_name_1h
     })
I have 10 dataframes like this that I create manually, and every time I want to add a new column I have to do it in all of them, which is time consuming. If I could build a function, I would only have to do it once.
The difference between the two snippets above is of course that one is for buy signals and the other is for sell signals.
I guess the inputs to the function should be:
_buy / _sell - for the column name
buy_ / sell_ - for the column input
I'm thinking the input to the function could be something like:
def create_dfs(col, col_input, hour):
    df = pd.DataFrame(
        {'Date' + col: col_input + "_orders_date_" + hour,
         'Name' + col: col_input + "_orders_name_" + hour
         })
    return df
buy_orders_1h = create_dfs("_buy", "buy_", "1h")
sell_orders_1h = create_dfs("_sell", "sell_", "1h")

A dataframe needs an index, so you can either pass an index manually or enter your row values in list form:

def create_dfs(col, col_input, hour):
    df = pd.DataFrame(
        {'Date' + col: [col_input + "_orders_date_" + hour],
         'Name' + col: [col_input + "_orders_name_" + hour]})
    return df
buy_orders_1h = create_dfs("_buy", "buy_", "1h")
sell_orders_1h = create_dfs("_sell", "sell_", "1h")
Edit: updated due to new information:
To look up a global variable by its name as a string, index globals() with that string:

'Date' + col: globals()[col_input + "_orders_date_" + hour]
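Putting the pieces together, a minimal sketch (assuming lists such as buy_orders_date_1h and buy_orders_name_1h already exist as module-level variables; note that since col_input already ends in an underscore, the literal below drops its leading underscore to avoid a doubled one):

import pandas as pd

# Hypothetical source lists; in the real code these already exist globally.
buy_orders_date_1h = ["2021-01-01", "2021-01-02"]
buy_orders_name_1h = ["A", "B"]

def create_dfs(col, col_input, hour):
    # globals() maps variable names (strings) to their values, so the
    # string "buy_orders_date_1h" resolves to the list defined above.
    df = pd.DataFrame(
        {'Date' + col: globals()[col_input + "orders_date_" + hour],
         'Name' + col: globals()[col_input + "orders_name_" + hour]})
    return df

buy_orders_1h = create_dfs("_buy", "buy_", "1h")
print(buy_orders_1h)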

Please check the output to see if this is what you want. You first create two dictionaries; then, depending on the buy=True condition, the function appends either to buying_df or to selling_df. I created two sample lists of dates and column names and iteratively appended to the desired dictionaries. Only after the dicts are populated is the pandas.DataFrame created: you do not need to create it iteratively, just once at the end, when your dates and names have been collected into a dict.
from collections import defaultdict
import pandas as pd

buying_df = defaultdict(list)
selling_df = defaultdict(list)

def add_column_to_df(date, name, buy=True):
    if buy:
        buying_df["Date_buy"].append(date)
        buying_df["Name_buy"].append(name)
    else:
        selling_df["Date_sell"].append(date)
        selling_df["Name_sell"].append(name)

dates = ["1900", "2000", "2010"]
names = ["Col_name1", "Col_name2", "Col_name3"]
for date, name in zip(dates, names):
    add_column_to_df(date, name)

#print(buying_df)
df = pd.DataFrame(buying_df)
print(df)
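For the sell side the same pattern applies; a quick sketch:

# Append a sell signal with buy=False, then materialize once at the end:
add_column_to_df("2020", "Col_name4", buy=False)
sell_df = pd.DataFrame(selling_df)
print(sell_df)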

Related

pandas: while loop to simultaneously advance through multiple lists and call functions

I want my code to:
read data from a CSV and make a dataframe: "source_df"
see if the dataframe contains any columns specified in a list: "possible_columns"
call a unique function to replace the values in each column whose header is found in the "possible_columns" list, then insert the modified values in a new dataframe: "destination_df"
Here it is:
import pandas as pd

#creates source_df
file = "yes-no-true-false.csv"
data = pd.read_csv(file)
source_df = pd.DataFrame(data)

#creates destination_df
blanklist = []
destination_df = pd.DataFrame(blanklist)

#create the column header lists for comparison in the while loop
columns = source_df.head(0)
possible_columns = ['yes/no', 'true/false']

#establish the functions list and define the functions to replace column values
fix_functions_list = ['yes_no_fix()', 'true_false_fix()']

def yes_no_fix():
    destination_df['yes/no'] = destination_df['yes/no fixed'].replace("No","0").replace("Yes","1")

def true_false_fix():
    destination_df['true/false'] = destination_df['true/false fixed'].replace('False', '1').replace('True', '0')

'''use the counter to call a unique function from the function list to replace the values in each column whose header is found in the "possible_columns" list, insert the modified values in "destination_df", then advance the counter'''
counter = 0
while counter < len(possible_columns):
    if possible_columns[counter] in columns:
        destination_df.insert(counter, possible_columns[counter], source_df[possible_columns[counter]])
        fix_functions_list[counter]
    counter = counter + 1

#see if it works
print(destination_df.head(10))
When I print(destination_df), I see the unmodified column values from source_df. When I call the functions independently they work, which makes me think something is going wrong in my while loop.
Your issue is that you are trying to call a function that is stored in a list as a string.
fix_functions_list[counter]
This will not actually run the function; it just accesses the string value.
I would find another way to run these functions:
def yes_no_fix():
    destination_df['yes/no'] = destination_df['yes/no fixed'].replace("No","0").replace("Yes","1")

def true_false_fix():
    destination_df['true/false'] = destination_df['true/false fixed'].replace('False', '1').replace('True', '0')

fix_functions_list = {0: yes_no_fix, 1: true_false_fix}
and change the function call to the following:
fix_functions_list[counter]()
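For context, the corrected loop from the question then looks like this (a sketch of the call site):

counter = 0
while counter < len(possible_columns):
    if possible_columns[counter] in columns:
        destination_df.insert(counter, possible_columns[counter], source_df[possible_columns[counter]])
        fix_functions_list[counter]()  # the parentheses actually call the function
    counter = counter + 1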
Alternatively, you can skip the named functions entirely and drive everything from a mapping dict:

#creates source_df
file = "yes-no-true-false.csv"
data = pd.read_csv(file)
source_df = pd.DataFrame(data)

possible_columns = ['yes/no', 'true/false']
mapping_dict = {'yes/no': {"No": "0", "Yes": "1"}, 'true/false': {'False': '1', 'True': '0'}}
old_columns = [column for column in source_df.columns if column not in possible_columns]
existed_columns = [column for column in source_df.columns if column in possible_columns]

new_df = source_df[existed_columns].copy()
for column in new_df.columns:
    # map returns a new Series, so assign it back to keep the result
    new_df[column] = new_df[column].map(mapping_dict[column])
new_df[old_columns] = source_df[old_columns]

How to apply a custom function to the same dataframe multiple times in python?

I am trying to explode columns of a pandas dataframe to make new columns.
def explode(child_df, column_value):
    child_df = child_df.dropna(subset=[column_value])
    if isinstance(child_df[column_value].iloc[0], str):
        print('tried')
        child_df[column_value] = child_df[column_value].apply(ast.literal_eval)
    expanded_child_df = (pd.concat({i: json_normalize(x) for i, x in child_df.pop(column_value).items()})
                         .reset_index(level=1, drop=True)
                         .join(child_df, how='right', lsuffix='_left', rsuffix='_right')
                         .reset_index(drop=True))
    expanded_child_df.columns = map(str.lower, expanded_child_df.columns)
    return expanded_child_df
Is there a way to apply the explode function to a dataframe multiple times? This is where I'm trying to apply the explode function to the dataframe consolidated_df:

def clean():
    column_value = ['tracking_results', 'trackable_items', 'events']
    consolidated_df_cleaner = explode(consolidated_df, column_value.value)
    # Need to iterate over column_value and pass each value as the second argument into the `explode` function on the same dataframe
    consolidated_df_cleaner.to_csv('/home/response4.csv', index=False)
I tried this, but it won't work:

pd_list = []
for param in column_value:
    pd_list.append(apply(explode(consolidated_df), param))
This is what I'm doing right now, and I need to avoid it:
consolidated_df_cleaner=explode(consolidated_df,'tracking_results')
consolidated_df_cleaner2=explode(consolidated_df_cleaner,'trackable_items')
consolidated_df_cleaner3= explode(consolidated_df_cleaner2,'events')
consolidated_df_cleaner3.to_csv('/home/response4.csv',index=False)
expected output:

tracking_results  trackable_items  events
intransit         abc              22
intransit         xqy              23
Try
(consolidated_df
.pipe(explode,'tracking_results')
.pipe(explode,'trackable_items')
.pipe(explode,'events')
.to_csv('/home/response4.csv',index=False)
)
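If the list of columns is dynamic, an equivalent loop form would be (a sketch):

# Apply explode once per column, feeding each result into the next call:
df = consolidated_df
for col in ['tracking_results', 'trackable_items', 'events']:
    df = explode(df, col)
df.to_csv('/home/response4.csv', index=False)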

I want to run a loop with condition and save all outputs as dataframes with different names

I wrote a function which depends only on a dataframe; the function's output is also a dataframe. I would like to make different dataframes according to a condition and save them as different datasets with different names. However, I couldn't save them as dataframes with different names, so instead I do the process manually. Is there code which would do the same? It would be very helpful.
import os
import numpy as np
import pandas as pd

data1 = pd.read_csv('C:/Users/Oz/Desktop/vintage/vintage1.csv', encoding='latin-1')
product_list = data1['product_types'].unique()

def vintage_table(df):
    df['Disbursement_Date'] = pd.to_datetime(df.Disbursement_Date)
    df['Closing_Date'] = pd.to_datetime(df.Closing_Date)
    df['NPL_date'] = pd.to_datetime(df.NPL_date, errors='ignore')
    df['NPL_date_period'] = df.loc[df.NPL_date > '2015-01-01', 'NPL_date'].apply(lambda x: x.strftime('%Y-%m'))
    df['Dis_date_period'] = df.Disbursement_Date.apply(lambda x: x.strftime('%Y-%m'))
    df['diff'] = ((df.NPL_date - df.Disbursement_Date) / np.timedelta64(3, 'M')).round(0)
    df = df.groupby(['Dis_date_period','NPL_date_period']).agg({'Dis_amount': 'sum', 'NPL_amount': 'sum', 'diff': 'mean'})
    df.reset_index(level=0, inplace=True)
    df['Vintage_Ratio'] = df['NPL_amount'] / df['Dis_amount']
    table = pd.pivot_table(df, values='Vintage_Ratio', index='Dis_date_period', columns=['diff']).fillna(0)
    return
The above is the function
#for e in product_list:
# sub = data1[data1['product_types'] == e]
# print(sub)
consumer = data1[data1['product_types'] == product_list[0]]
mortgage = data1[data1['product_types'] == product_list[1]]
vehicle = data1[data1['product_types'] == product_list[2]]
table_con = vintage_table(consumer)
table_mor = vintage_table(mortgage)
table_veh = vintage_table(vehicle)
I would like to improve this part; is there a better way to do the same process?
You could have your vintage_table() function return a dataframe (i.e. return table instead of a bare return) rather than just modifying one dataframe over and over, and that way you could do this in the second code block:

table_con = vintage_table(consumer)
table_mor = vintage_table(mortgage)
table_veh = vintage_table(vehicle)
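If you also want to avoid the separately named variables (as in your commented-out loop), a common pattern is to collect the results in a dict keyed by product type; a sketch, assuming vintage_table returns the pivot table:

tables = {}
for product in product_list:
    sub = data1[data1['product_types'] == product]
    tables[product] = vintage_table(sub)
# access one table by name, e.g. tables[product_list[0]]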

Apply a function in a dataframe's columns [Python]

I just wrote this function to calculate a person's age based on two columns in a Python DataFrame. Unfortunately, if I use return, the function returns the same value for all rows, but if I use the print statement the function gives me the right values.
Here is the code:
def calc_age(dataset):
    index = dataset.index
    for element in index:
        year_nasc = train['DT_NASCIMENTO_BENEFICIARIO'][element][6:]
        year_insc = train['ANO_CONCESSAO_BOLSA'][element]
        age = int(year_insc) - int(year_nasc)
        print('Age: ', age)
        #return age

Sample values:
train['DT_NASCIMENTO_BENEFICIARIO'] contains dates like 03-02-1987
train['ANO_CONCESSAO_BOLSA'] contains years like 2009
What am I doing wrong?!
If what you want is to subtract the year of DT_NASCIMENTO_BENEFICIARIO from ANO_CONCESSAO_BOLSA, and df is your DataFrame:
# cast to datetime
df["DT_NASCIMENTO_BENEFICIARIO"] = pd.to_datetime(df["DT_NASCIMENTO_BENEFICIARIO"])
df["age"] = df["ANO_CONCESSAO_BOLSA"] - df["DT_NASCIMENTO_BENEFICIARIO"].dt.year
# print the result, or do something else with it:
print(df["age"])

iterate over list of dicts to create different strings

I have a pandas file with 3 different columns that I turn into a dictionary with to_dict; the result is a list of dictionaries:

df = [
    {'HEADER1': 'col1-row1', 'HEADER2': 'col2-row1', 'HEADER3': 'col3-row1'},
    {'HEADER1': 'col1-row2', 'HEADER2': 'col2-row2', 'HEADER3': 'col3-row2'}
]

Now my problem is that I need the values of 'col2-rowX' and 'col3-rowX' to build a URL and use requests and bs4 to scrape the websites.
I need my result to be something like the following:

requests.get('http://www.website.com/' + row1_col2 + 'another-string' + row1_col3 + 'another-string')

And I need to do that for every dictionary in the list.
I have tried iterating over the dictionaries using for loops, something like:
import pandas as pd
import os
os.chdir('C://Users/myuser/Desktop')
df = pd.DataFrame.from_csv('C://Users/myuser/Downloads/export.csv')
#Remove 'Code' column
df = df.drop('Code', axis=1)
#Remove 'Code2' as index
df = df.reset_index()
#Rename columns for easier manipulation
df.columns = ['CB', 'FC', 'PO']
#Convert to dictionary for easy URL iteration and creation
df = df.to_dict('records')
for row in df:
    for key in row:
        print(key)
You only ever iterate twice, and you short-circuit out of the nested for loop every time it executes because of the return statement there. Looking up the necessary information from each dictionary will allow you to build up your URLs. One possible example:
def get_urls(l_d):
    l = []
    for d in l_d:
        l.append('http://www.website.com/' + d['HEADER2'] + 'another-string' + d['HEADER3'] + 'another-string')
    return l

df = [{'HEADER1': 'col1-row1', 'HEADER2': 'col2-row1', 'HEADER3': 'col3-row1'},
      {'HEADER1': 'col1-row2', 'HEADER2': 'col2-row2', 'HEADER3': 'col3-row2'}]
print(get_urls(df))
>>> ['http://www.website.com/col2-row1another-stringcol3-row1another-string', 'http://www.website.com/col2-row2another-stringcol3-row2another-string']
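From there, fetching each page is a loop over the returned list; a sketch, assuming the URLs are real and bs4 is installed:

import requests
from bs4 import BeautifulSoup

for url in get_urls(df):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # ... scrape what you need from soup here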
