I've already seen the "Replace a row by a new DataFrame" solution, but it's pretty unclear to me.
I've got a DataFrame of model results with a MultiIndex on the model name and the mode (train/test). I want to update it with the latest run while keeping the other models' results: create the DataFrame if it doesn't exist, or update the row whose name and mode match the dic variable. Here is my code:
import pandas as pd

def save_results(dic, path="../ModelsResults"):
    try:
        df_results = pd.read_pickle(path)
        print("Updating ModelResults...")
        df_now = pd.DataFrame.from_dict([dic])
        if df_results.index.isin([(dic["Model"], dic["Mode"])]).any():
            print("\tUpdating Model/Mode...")
            df_now = df_now.drop(["Model", "Mode"], axis=1)
            df_results.at[dic["Model"], dic["Mode"]] = df_now
        else:
            print("\tCreating Model/Mode...")
            df_results = df_results.append(df_now)
    except FileNotFoundError:
        print("Creating ModelResults...")
        df_results = pd.DataFrame.from_dict([dic])
        df_results = df_results.set_index(["Model", "Mode"])
    df_results.to_pickle(path)
    print("Done")
    return df_results
Every metric that I want to save is in the dic variable. For example:
dic = {
    "Model": "Dummy-PredictingAlwaysZero",
    "Mode": "Train",
    "MSE": mse,
    "nRMSE": nrmse,
    "nDCG#10": ndcg(train["rel"].values, y_pred, k=10),
    "nDCG#50": ndcg(train["rel"].values, y_pred, k=50)
}
df_results = save_results(dic, path = "./ModelsResults")
And the expected DataFrame is like:
                                       MSE   nDCG#10   nDCG#50     nRMSE
Model                      Mode
Dummy-PredictingAlwaysZero Train   0.08639  0.162948  0.106816  0.293922
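For reference, the update-or-insert can be phrased as "drop the old row if it exists, then append the fresh one", which sidesteps the .at call entirely. A minimal sketch, assuming dic always carries the Model and Mode keys as above (this is not the exact code from the question):

import pandas as pd

def save_results(dic, path="./ModelsResults"):
    # Split the MultiIndex key from the metric columns.
    key = (dic["Model"], dic["Mode"])
    metrics = {k: v for k, v in dic.items() if k not in ("Model", "Mode")}
    # One-row frame carrying the new results, indexed by (Model, Mode).
    row = pd.DataFrame([metrics],
                       index=pd.MultiIndex.from_tuples([key], names=["Model", "Mode"]))
    try:
        old = pd.read_pickle(path)
        # Drop any existing row for this key, then append the fresh one.
        df_results = pd.concat([old.drop(index=[key], errors="ignore"), row])
    except FileNotFoundError:
        df_results = row
    df_results.to_pickle(path)
    return df_results

pd.concat is used instead of DataFrame.append (deprecated in recent pandas), and errors="ignore" makes the drop safe when the key is new.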
I was writing an automatic stock strategy yield calculation program in Python. Here's my code:
import FinanceDataReader as fdr
import numpy as np
import os
import pandas as pd
from openpyxl import Workbook
from statistics import geometric_mean, StatisticsError
# ...(more modules for python)

pd.options.display.float_format = '{:.5f}'.format

file_list = os.listdir('/home/sejahui/projects/stock_data_excel')
for i in range(20):
    os.chdir('/home/sejahui/projects/stock_data_excel')
    odd = file_list[i]
    data = pd.read_excel('/home/sejahui/projects/stock_data_excel/' + str(odd))
    def calMACD(data, short=5, long=25, signal=9):
        data = data.sort_index()
        data['MVA_25'] = data['Close'].ewm(span=long, adjust=False).mean()
        data['MVA_5'] = data['Close'].ewm(span=short, adjust=False).mean()
        data['MACD'] = data['MVA_5'] - data['MVA_25']  # same as the two ewm() calls above
        data['Signal'] = data['MACD'].ewm(span=signal, adjust=False).mean()
        #data['Buy_sign']=(data['MACD']-data['Signal']) >=600
        data['Buy_sign'] = np.where(data['MACD'] - data['Signal'] >= 451, 'Buy', 'Sell')
        #data['Target_1']=(data['Close']-data['Close'].shift(1))/data['Close'].shift(1)*100
        #data['Target_1']=np.where(data['Buy_sign']=='Buy', (data['Change'])+1,1)
        #data['Target_2']=np.where(data['Buy_sign']=='Sell', (data['Change'])+1,1)
        #data['Real_world']= 1000000*data['Target_1']
        #data['Real_world_2']= 1000000*data['Target_2']
        #data['Condition'] = np.where(data['Real_world']<1000000, data['Real_world']-data['Real_world'].shift(-2),1)
        ##data['Condition_2'] = np.where(data['Real_world']<1000000, data['Target_1'].shift(-2),1)
        #data['Moneyflow'] =
        #plt.plot(data['Date'], data['Real_world'])
        #data[data.Buy_sign !='Sell']
        '''
        data['Target_1']=np.where(data['Buy_sign']=='Buy', data['Change'],1)
        data['Target_2']=np.where(data['Buy_sign']=='Sell', data ['Change'],1)
        data['Yield']=np.where(data['Buy_sign']=='Sell', data['Target_1']/data['Target_2'],1 )
        '''
        '''
        data['Result']=data['Target_1'].cumprod()
        data['Result_2']=data['Target_2'].cumprod()
        data['??????'] = data['Result'] - data['Result_2']
        '''
        return data
    Adjusted = calMACD(data)
    Adjusted.drop(['Change'], axis=1, inplace=True)
    Filtered = Adjusted[Adjusted.Buy_sign != 'Sell'].copy()
    #print(Filtered)
    #Filtered = (Adjusted.Buy_sign =='Buy') #(Adjusted.Condition = 1.0)
    #Master = Adjusted.loc[Adjusted,['Date','Buy_sign','Target_1','Real_world',]]
    #print(Adjusted)
    def backtester(Filtered):
        Filtered['Change'] = ((Filtered['Close'] - Filtered['Close'].shift(1)) / Filtered['Close'].shift(1)) + 1
        #data['Target_1']=np.where(data['Buy_sign']=='Buy', (data['Change'])+1,1)
        Filtered['Real_world'] = 1000000 * Filtered['Change']
        #Filtered['Condition'] = np.where(Filtered['Real_world']<1000000, Filtered['Real_world'].shift(-2)-Filtered['Real_world'],1)
        Filtered['Condition'] = np.where(Filtered['Real_world'] < 1000000, Filtered['Change'].shift(-2), 1)
        #Filtered['Target_1'] = np.where(Filtered['Buy_sign']=='Buy', (Filtered['Change'])+1,1)
        #Filtered['Condition'] = np.where(Filtered['Real_world']<1000000, Filtered['Real_world'].shift(-2)-Filtered['Real_world'],1)
        return Filtered
    s = backtester(Filtered)
    e = s[s.Condition != 1.00000]
    x = e.dropna()
    y = x['Condition']
    list_1 = []
    write_wb = Workbook()
    write_ws = write_wb.create_sheet('MACD&Signal gap data sheet')
    write_ws = write_wb.active
    write_ws['A1'] = 'Name'
    write_ws['B1'] = 'Profit'
    try:
        print(geometric_mean(y) * 1000000 * 12)
    except StatisticsError as e:
        print('Sell is empty:', odd)
    else:
        d = geometric_mean(y) * 1000000 * 12
        print(d, odd)
        list_1.insert(i, d)
print(list_1)
Here's the part I'm having trouble with:
s = backtester(Filtered)
e = s[s.Condition != 1.00000]
x = e.dropna()
y = x['Condition']
list_1 = []
try:
    print(geometric_mean(y) * 1000000 * 12)
except StatisticsError as e:
    print('Sell is empty:', odd)
else:
    d = geometric_mean(y) * 1000000 * 12
    print(d)
    list_1.insert(d)
print(list_1)
When I run this part, the list only keeps the last result of the try/except/else block. My intention was to save all the results. What should I change so that all of them are saved?
Here's the output of the list:
[11772769.197974786]
Your problem is that you are using insert instead of append. The main difference is that insert takes a second argument for the position you want to insert your element at; when none is provided it defaults to 0, so you are consistently inserting at the same index, and you end up with a list holding only the last element at the first position.
To fix that simply use append instead.
else:
    d = geometric_mean(y) * 1000000 * 12
    print(d)
    list_1.append(d)
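As a quick illustration with made-up numbers (the floats below are stand-ins for the per-file results), append keeps every value, provided the list is created once before the loop:

results = []                 # create the list once, before the loop
for d in (1.5, 2.5, 3.5):    # hypothetical per-iteration results
    results.append(d)        # append adds each new value at the end
print(results)               # [1.5, 2.5, 3.5] -- nothing is overwritten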
You want to use append, not insert. See Python Data Structures.
Change list_1.insert(d) to list_1.append(d)
The insert is defaulting to index 0 and just updating it each time.
Edit: Just noticed your answer is in the question title.
I'm facing trouble creating a function to store an API response in columns. I pass a city name, and the response gives details such as max_temp, min_temp, and pressure, which I want to store in new columns.
import pyowm
from pyowm.utils import config
from pyowm.utils import timestamps

api_key = "..."  # key from OpenWeather (free)
owm = pyowm.OWM(api_key)
mgr = owm.weather_manager()

data = []

def get_weather(city):
    observation = mgr.weather_at_place(city)
    l = observation.weather
    Wind_Speed = l.wind()['speed']
    Temp = l.temperature('celsius')['temp']
    Max_temp = l.temperature('celsius')['temp_max']
    Min_temp = l.temperature('celsius')['temp_min']
    #Heat_index = l.heat_index
    Humidity = l.humidity
    Pressure = l.pressure['press']
    weather = {"City": city, "Wind_Speed": Wind_Speed, "Temp": Temp,
               "Max_temp": Max_temp, "Min_temp": Min_temp,
               "Humidity": Humidity, "Pressure": Pressure}
    return weather

for city in df2['City']:
    get_weather(city)
df = df.append(data, True)
I want to add each city's weather details as columns, keyed on the city name, with one function that stores all the details; I don't want to create separate functions per column.
You can return a dictionary from your function.
def get_weather(city):
    observation = mgr.weather_at_place(city)
    l = observation.weather
    Wind_Speed = l.wind()['speed']
    Temp = l.temperature('celsius')['temp']
    Max_temp = l.temperature('celsius')['temp_max']
    resp = dict()
    resp['Wind_Speed'] = Wind_Speed
    resp['Temp'] = Temp
    resp['Max_temp'] = Max_temp
    return resp
df["Wind_speed"] = df["city"].apply(lambda x: get_weather(x)['Wind_Speed'])
df["Temp"] = df["city"].apply(lambda x: get_weather(x)['Temp'])
df["Max_temp"] = df["city"].apply(lambda x: get_weather(x)['Max_temp'])
I have written the following code, in which the function model_data performs a particular set of tasks. I pass it a mapping of badge names to a category type (1 or 2), along with an empty DataFrame data.
But while running the code I am getting an error. I searched SO for answers, but this type of question was not found.
CODE
#Model Function
def model_data(badge_list, data):
    for key, value in badge_list.items():
        #Check for Post Type
        if (value == 1):
            badge_type = posts.loc[posts.PostTypeId == '1']
        elif (value == 2):
            badge_type = posts.loc[posts.PostTypeId == '2']
        #Obtain required fields from Badge Data
        badge_type = badge_type[['OwnerUserId', 'Id', 'Score', 'CreationDate']]
        badge_type.columns = ['UserId', 'Id', 'Score', 'CreationDate']
        Badge = key
        #Obtain time when user first obtained Badge
        badge_data = user_badge_dt(Badge)
        #Find the number of posts made before and after 1 week of Badge Attainment
        post_data = post_details(df1=badge_data, df2=badge_type)
        post_data.date = pd.to_datetime(post_data.date)
        #Calculate APR
        post_data = APR(post_data)
        #Calculate Score
        post_data = score(df=post_data, post_type=badge_type)
        #Generate Final Dataframe with Badge Count
        data1 = badge_number(post_data)
        data1 = data1[['1', '2', '3', 'date', 'Score', 'APR']]
        #Append Dataframe
        data = data.append(data1)
    return data
#Function Call
questionBadge_list = {'Good Question':1, 'Explainer':2}
data = pd.DataFrame()
badge1_data = model_data(badge_list = questionBadge_list, data = data)
ERROR
IndexError: Too many levels: Index has only 1 level, not 2
ERROR LINE
The line badge_data = user_badge_dt(Badge) raises this error, so I am adding the complete function.
#Function to obtain UserId with the date-time of obtaining given badge for the first time
def user_badge_dt(badge):
    #Creating DataFrame to obtain all UserId and date-time of given badge
    df = badges[['UserId','Date']].loc[badges.Name == badge]
    #Obtaining the first date-time of badge attainment
    v = df.groupby("UserId", group_keys=False)['Date'].nsmallest(1)
    v.index = v.index.droplevel(1)
    df['date'] = df['UserId'].map(v)
    df.drop(columns='Date', inplace=True)
    #Removing all duplicate values of Users
    df.drop_duplicates(subset='UserId', inplace=True)
    return df
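A hedged note on a likely cause: depending on the pandas version, groupby(...)['Date'].nsmallest(1) with group_keys=False can come back with a single-level index (just the original row labels), so v.index.droplevel(1) fails with exactly "Index has only 1 level". The first attainment date per user can be computed without nsmallest at all; a sketch under the question's own column names:

def user_badge_dt(badge):
    # All (UserId, Date) pairs for the given badge.
    df = badges.loc[badges.Name == badge, ['UserId', 'Date']].copy()
    # Earliest attainment per user; the result is indexed by UserId alone,
    # so no droplevel() call is needed.
    first_date = df.groupby('UserId')['Date'].min()
    df['date'] = df['UserId'].map(first_date)
    df = df.drop(columns='Date').drop_duplicates(subset='UserId')
    return df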
I am getting a KeyError while converting the variables using a label encoder. This is the code that I used:
from sklearn import preprocessing

def preprocessor(df):
    res_df = df.copy()
    le = preprocessing.LabelEncoder()
    res_df['"job"'] = le.fit_transform(res_df['"job"'])
    res_df['"marital"'] = le.fit_transform(res_df['"marital"'])
    res_df['"education"'] = le.fit_transform(res_df['"education"'])
    res_df['"default"'] = le.fit_transform(res_df['"default"'])
    res_df['"housing"'] = le.fit_transform(res_df['"housing"'])
    res_df['"month"'] = le.fit_transform(res_df['"month"'])
    res_df['"loan"'] = le.fit_transform(res_df['"loan"'])
    res_df['"contact"'] = le.fit_transform(res_df['"contact"'])
    res_df['"day_of_week"'] = le.fit_transform(res_df['"day"'])
    res_df['"poutcome"'] = le.fit_transform(res_df['"poutcome"'])
    res_df['"y"'] = le.fit_transform(res_df['"y"'])
    return res_df
While executing the function, I am getting a KeyError:
encoded_df = preprocessor(df1)
x = encoded_df.drop(['"y"'],axis =1).values
y = encoded_df['"y"'].values
The KeyError appears even though I have split the columns with sep=';' when reading the file. Can anyone please help?
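One hedged guess at the cause: if the CSV was read without quote handling, the header names keep their literal double quotes, and keys like '"job"' only work by accident while plain names miss. Stripping the quotes from the column names lets the usual names work; a sketch, where 'bank.csv' is a placeholder for the actual file:

import pandas as pd

df1 = pd.read_csv('bank.csv', sep=';')  # 'bank.csv' stands in for the real path
# Remove any literal double quotes that survived parsing in the header row.
df1.columns = df1.columns.str.replace('"', '', regex=False)
print(df1.columns.tolist())  # plain names such as 'job', 'marital', 'education', ...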
I'm trying to filter my CIFAR-100 ndarray by class index, here is my code:
import os
import numpy

def get_cifar100(folder, class_idx):
    train_fname = os.path.join(folder, 'train')
    test_fname = os.path.join(folder, 'test')
    data_dict = unpickle(train_fname)  # unpickle() as defined on the CIFAR page
    train_data = data_dict['data']
    train_fine_labels = data_dict['fine_labels']
    train_coarse_labels = data_dict['coarse_labels']
    # Filtering process
    filt_tdata = numpy.empty((0))
    for i, v in enumerate(train_coarse_labels):
        if v == class_idx:
            filt_tdata = numpy.append(filt_tdata, train_data[i])
    data_dict = unpickle(test_fname)
    test_data = data_dict['data']
    test_fine_labels = data_dict['fine_labels']
    test_coarse_labels = data_dict['coarse_labels']
    bm = unpickle(os.path.join(folder, 'meta'))
    clabel_names = bm['coarse_label_names']
    flabel_names = bm['fine_label_names']
    return (data_dict, filt_tdata,
            numpy.array(train_coarse_labels), numpy.array(train_fine_labels),
            test_data, numpy.array(test_coarse_labels), numpy.array(test_fine_labels),
            clabel_names, flabel_names)
datapath = "./data/cifar-100-python"
data_dict, tr_data100, tr_clabels100, tr_flabels100, te_data100, te_clabels100, te_flabels100, clabel_names100, flabel_names100 = get_cifar100(datapath, 4)
print(len(tr_data100))
I want to filter train_data by class_idx = 4 using train_coarse_labels. The original array has 50,000 entries, and I expected the filtered one to have 5,000, but I got more than the original size (7 million++). What's wrong with my function? Thanks.
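A hedged note on the likely cause: numpy.append flattens its arguments, so every matching image contributes its 3,072 pixel values to a growing 1-D array. CIFAR-100 has 2,500 training images per coarse class, and 2,500 × 3,072 ≈ 7.7 million, which matches the size observed. Boolean-mask indexing keeps the 2-D shape instead; a sketch under the same variable names, inside get_cifar100:

import numpy

train_data = numpy.asarray(data_dict['data'])               # shape (50000, 3072)
train_coarse_labels = numpy.asarray(data_dict['coarse_labels'])
# Keep whole rows whose coarse label matches, preserving the 2-D layout.
filt_tdata = train_data[train_coarse_labels == class_idx]
print(filt_tdata.shape)                                     # (2500, 3072)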