heatmap of values grouped by time - seaborn - python

I'm plotting the counts of a variable grouped by time as a heatmap. However, when including both hour and minute, the counts are quite low so the resulting heatmap doesn't really provide any real insight. Is it possible to group the counts in a bigger block of time? I'm hoping to test some different periods (5, 10 mins).
I'm also hoping to plot time on the x-axis. Similar to the output attached.
import seaborn as sns
import pandas as pd
from datetime import datetime
from datetime import timedelta
start = datetime(1900,1,1,10,0,0)
end = datetime(1900,1,1,13,0,0)
seconds = (end - start).total_seconds()
step = timedelta(minutes = 1)
array = []
for i in range(0, int(seconds), int(step.total_seconds())):
array.append(start + timedelta(seconds=i))
array = [i.strftime('%Y-%m-%d %H:%M%:%S') for i in array]
df2 = pd.DataFrame(array).rename(columns = {0:'Time'})
df2['Count'] = np.random.uniform(0.0, 0.5, size = len(df2))
df2['Count'] = df2['Count'].round(1)
df2['Time'] = pd.to_datetime(df2['Time'])
df2['Hour'] = df2['Time'].dt.hour
df2['Min'] = df2['Time'].dt.minute
g = df2.groupby(['Hour','Min','Count'])
count_df = g['Count'].nunique().unstack()
count_df.fillna(0, inplace = True)
sns.heatmap(count_df)

To deal with such cases, I think it would be easy to use data downsampling. It is also easy to change the thresholds. The axis labels in the output graph will need to be modified, but we recommend this method.
import seaborn as sns
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
start = datetime(1900,1,1,10,0,0)
end = datetime(1900,1,1,13,0,0)
seconds = (end - start).total_seconds()
step = timedelta(minutes = 1)
array = []
for i in range(0, int(seconds), int(step.total_seconds())):
array.append(start + timedelta(seconds=i))
array = [i.strftime('%Y-%m-%d %H:%M:%S') for i in array]
df2 = pd.DataFrame(array).rename(columns = {0:'Time'})
df2['Count'] = np.random.uniform(0.0, 0.5, size = len(df2))
df2['Count'] = df2['Count'].round(1)
df2['Time'] = pd.to_datetime(df2['Time'])
df2['Hour'] = df2['Time'].dt.hour
df2['Min'] = df2['Time'].dt.minute
df2.set_index('Time', inplace=True)
count_df = df2.resample('10min')['Count'].value_counts().unstack()
count_df.fillna(0, inplace = True)
sns.heatmap(count_df.T)

The way you could achieve this is by creating a column with numbers that have repeating elements for the number of minutes.
For example:
minutes = 3
x = [0,1,2]
np.repeat(x, repeats=minutes, axis=0)
>>>> [0,0,0,1,1,1,2,2,2]
and then group your data using this column.
So your code would look like:
...
minutes = 5
x = [i for i in range(int(df2.shape[0]/5))]
df2['group'] = np.repeat(x, repeats=minutes, axis=0)
g = df2.groupby(['Min', 'Count'])
count_df = g['Count'].nunique().unstack()
count_df.fillna(0, inplace = True)

Related

Python plotly how to change X and Y axis in button

I'm working on a graph that ilustrates computer usage each day. I want to have a button that will group dates monthly for last year and set y as AVERAGE (mean) and draw avg line.
My code:
import datetime
import numpy as np
import pandas as pd
import plotly.graph_objects as go
example_data = {"date": ["29/07/2022", "30/07/2022", "31/07/2022", "01/08/2022", "02/08/2022"],
"time_spent" : [15840, 21720, 40020, 1200, 4200]}
df = pd.DataFrame(example_data)
df["date"] = pd.to_datetime(df["date"], dayfirst=True)
df['Time spent'] = df['time_spent'].apply(lambda x:str(datetime.timedelta(seconds=x)))
df['Time spent'] = pd.to_datetime(df['Time spent'])
df = df.drop("time_spent", axis=1)
dfall = df.resample("M", on="date").mean().copy()
dfyearly = dfall.tail(12).copy()
dfweekly = df.tail(7).copy()
dfmonthly = df.tail(30).copy()
del df
dfs = {'Week':dfweekly, 'Month': dfmonthly, 'Year' : dfyearly, "All" : dfall}
for dframe in list(dfs.values()):
dframe['StfTime'] = dframe['Time spent'].apply(lambda x: x.strftime("%H:%M"))
frames = len(dfs) # number of dataframes organized in dict
columns = len(dfs['Week'].columns) - 1 # number of columns i df, minus 1 for Date
scenarios = [list(s) for s in [e==1 for e in np.eye(frames)]]
visibility = [list(np.repeat(e, columns)) for e in scenarios]
lowest_value = datetime.datetime.combine(datetime.date.today(), datetime.datetime.min.time())
highest_value = dfweekly["Time spent"].max().ceil("H")
buttons = []
fig = go.Figure()
for i, (period, df) in enumerate(dfs.items()):
print(i)
for column in df.columns[1:]:
fig.add_bar(
name = column,
x = df['date'],
y = df[column],
customdata=df[['StfTime']],
text=df['StfTime'],
visible=True if period=='Week' else False # 'Week' values are shown from the start
)
#Change display data to more friendly format
fig.update_traces(textfont=dict(size=20), hovertemplate='<b>Time ON</b>: %{customdata[0]}</br>')
#Change range for better scalling
this_value =df["Time spent"].max().ceil("H")
if highest_value <= this_value:
highest_value = this_value
fig.update_yaxes(range=[lowest_value, highest_value])
#Add average value indicator
average_value = df["Time spent"].mean()
fig.add_hline(y=average_value, line_width=3, line_dash="dash",
line_color="green")
# one button per dataframe to trigger the visibility
# of all columns / traces for each dataframe
button = dict(label=period,
method = 'restyle',
args = ['visible',visibility[i]])
buttons.append(button)
fig.update_yaxes(dtick=60*60*1000, tickformat='%H:%M')
fig.update_xaxes(type='date', dtick='D1')
fig.update_layout(updatemenus=[dict(type="dropdown",
direction="down",
buttons = buttons)])
fig.show()
EDIT 1.
Thanks to vestland I managed to get semi-working dropdown.
The problem is that the line added with add_hline affect all bar charts. I want it to display only on the chart that it had been added for. Also after passing in custom data for nicer display, the space between bars is doubled. Any way to fix above issues?

Python's `.loc` is really slow on selecting subsets of Data

I'm having a large multindexed (y,t) single valued DataFrame df. Currently, I'm selecting a subset via df.loc[(Y,T), :] and create a dictionary out of it. The following MWE works, but the selection is very slow for large subsets.
import numpy as np
import pandas as pd
# Full DataFrame
y_max = 50
Y_max = range(1, y_max+1)
t_max = 100
T_max = range(1, t_max+1)
idx_max = tuple((y,t) for y in Y_max for t in T_max)
df = pd.DataFrame(np.random.sample(y_max*t_max), index=idx_max, columns=['Value'])
# Create Dictionary of Subset of Data
y1 = 4
yN = 10
Y = range(y1, yN+1)
t1 = 5
tN = 9
T = range(t1, tN+1)
idx_sub = tuple((y,t) for y in Y for t in T)
data_sub = df.loc[(Y,T), :] #This is really slow
dict_sub = dict(zip(idx_sub, data_sub['Value']))
# result, e.g. (y,t) = (5,7)
dict_sub[5,7] == df.loc[(5,7), 'Value']
I was thinking of using df.loc[(y1,t1),(yN,tN), :], but it does not work properly, as the second index is only bounded in the final year yN.
One idea is use Index.isin with itertools.product in boolean indexing:
from itertools import product
idx_sub = tuple(product(Y, T))
dict_sub = df.loc[df.index.isin(idx_sub),'Value'].to_dict()
print (dict_sub)

Dataframe with Monte Carlo Simulation calculation next row Problem

I want to build up a Dataframe from scratch with calculations based on the Value before named Barrier option. I know that i can use a Monte Carlo simulation to solve it but it just wont work the way i want it to.
The formula is:
Value in row before * np.exp((r-sigma**2/2)*T/TradingDays+sigma*np.sqrt(T/TradingDays)*z)
The first code I write just calculates the first column. I know that I need a second loop but can't really manage it.
The result should be, that for each simulation it will calculate a new value using the the value before, for 500 Day meaning S_1 should be S_500 with a total of 1000 simulations. (I need to generate new columns based on the value before using the formular.)
similar to this:
So for the 1. Simulations 500 days, 2. Simulation 500 day and so on...
import numpy as np
import pandas as pd
from scipy.stats import norm
import random as rd
import math
simulation = 0
S_0 = 42
T = 2
r = 0.02
sigma = 0.20
TradingDays = 500
df = pd.DataFrame()
for i in range (0,TradingDays):
z = norm.ppf(rd.random())
simulation = simulation + 1
S_1 = S_0*np.exp((r-sigma**2/2)*T/TradingDays+sigma*np.sqrt(T/TradingDays)*z)
df = df.append ({
'S_1':S_1,
'S_0':S_0
}, ignore_index=True)
df = df.round ({'Z':6,
'S_T':2
})
df.index += 1
df.index.name = 'Simulation'
print(df)
I found another possible code which i found here and it does solve the problem but just for one row, the next row is just the same calculation. Generate a Dataframe that follow a mathematical function for each column / row
If i just replace it with my formular i get the same problem.
replacing:
exp(r - q * sqrt(sigma))*T+ (np.random.randn(nrows) * sqrt(deltaT)))
with:
exp((r-sigma**2/2)*T/nrows+sigma*np.sqrt(T/nrows)*z))
import numpy as np
import pandas as pd
from scipy.stats import norm
import random as rd
import math
S_0 = 42
T = 2
r = 0.02
sigma = 0.20
TradingDays = 50
Simulation = 100
df = pd.DataFrame({'s0': [S_0] * Simulation})
for i in range(1, TradingDays):
z = norm.ppf(rd.random())
df[f's{i}'] = df.iloc[:, -1] * np.exp((r-sigma**2/2)*T/TradingDays+sigma*np.sqrt(T/TradingDays)*z)
print(df)
I would work more likely with the last code and solve the problem with it.
How about just overwriting the value of S_0 by the new value of S_1 while you loop and keeping all simulations in a list?
Like this:
import numpy as np
import pandas as pd
import random
from scipy.stats import norm
S_0 = 42
T = 2
r = 0.02
sigma = 0.20
trading_days = 50
output = []
for i in range(trading_days):
z = norm.ppf(random.random())
value = S_0*np.exp((r - sigma**2 / 2) * T / trading_days + sigma * np.sqrt(T/trading_days) * z)
output.append(value)
S_0 = value
df = pd.DataFrame({'simulation': output})
Perhaps I'm missing something, but I don't see the need for a second loop.
Also, this eliminates calling df.append() in a loop, which should be avoided. (See here)
Solution based on the the answer of bartaelterman, thank you very much!
import numpy as np
import pandas as pd
from scipy.stats import norm
import random as rd
import math
#Dividing the list in chunks to later append it to the dataframe in the right order
def chunk_list(lst, chunk_size):
for i in range(0, len(lst), chunk_size):
yield lst[i:i + chunk_size]
def blackscholes():
d1 = ((math.log(S_0/K)+(r+sigma**2/2)*T)/(sigma*np.sqrt(2)))
d2 = ((math.log(S_0/K)+(r-sigma**2/2)*T)/(sigma*np.sqrt(2)))
preis_call_option = S_0*norm.cdf(d1)-K*np.exp(-r*T)*norm.cdf(d2)
return preis_call_option
K = 40
S_0 = 42
T = 2
r = 0.02
sigma = 0.2
U = 38
simulation = 10000
trading_days = 500
trading_days = trading_days -1
#creating 2 lists for the first and second loop
loop_simulation = []
loop_trading_days = []
#first loop calculates the first column in a list
for j in range (0,simulation):
print("Progressbar_1_2 {:2.2%}".format(j / simulation), end="\n\r")
S_Tag_new = 0
NORM_S_INV = norm.ppf(rd.random())
S_Tag = S_0*np.exp((r-sigma**2/2)*T/trading_days+sigma*np.sqrt(T/trading_days)*NORM_S_INV)
S_Tag_new = S_Tag
loop_simulation.append(S_Tag)
#second loop calculates the the rows for the columns in a list
for i in range (0,trading_days):
NORM_S_INV = norm.ppf(rd.random())
S_Tag = S_Tag_new*np.exp((r-sigma**2/2)*T/trading_days+sigma*np.sqrt(T/trading_days)*NORM_S_INV)
loop_trading_days.append(S_Tag)
S_Tag_new = S_Tag
#values from the second loop will be divided in number of Trading days per Simulation
loop_trading_days_chunked = list(chunk_list(loop_trading_days,trading_days))
#First dataframe with just the first results from the firstloop for each simulation
df1 = pd.DataFrame({'S_Tag 1': loop_simulation})
#Appending the the chunked list from the second loop to a second dataframe
df2 = pd.DataFrame(loop_trading_days_chunked)
#Merging both dataframe into one
df3 = pd.concat([df1, df2], axis=1)

np.log returns a dataframe full of NaNs

I have made 2 functions, one for the cumulative logarithmic returns and the other for the total relative return.
Cumulative logarithmic returns:
# Cumulative logarithmic returns function:
def tlog_r(data, start, end):
tlog_return = copy.deepcopy(data)
for t in range(0,len(tlog_return)):
x = data[t]
y = data[0]
tlog_return[t] = x/y
tlog_return = np.log(tlog_return)
tlog_return[0] = 0
return tlog_return`
Total relative returns:
# Total relative returns function:
def tr_rel(data):
tlog_return = copy.deepcopy(data)
for t in range(0,len(tlog_return)):
x = data[t]
y = data[0]
tlog_return[t] = x/y
tlog_return = np.log(tlog_return)
tlog_return[0] = 0
tr_relative = copy.deepcopy(tlog_return)
for t in range(0,len(tr_relative)):
tr_relative[t] = 100*(np.exp(tr_relative[t])-1)
print(tr_relative)
return tr_relative`
I want to calculate them from a dataframe of a stock between 2 dates.
It doesn't give any error but if dates don't start in 2000, 2005 or 2011 it returns a dataframe full of NaNs except for the value in index [0].
Why is this happening? How can I solve it?
In case you need it, this is the part of the code where I call the functions:
from relative_returns_functions import tlog_r, tr_rel
from pandas_datareader import data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import copy
ticker='AAPL'
start_date='2000-01-01'
end_date='2019-12-31'
price='Close'
# Program
panel_data = data.DataReader(ticker , 'yahoo', start_date, end_date)[price]
title = '{} {} price'.format(ticker, price) #Plot title
panel_data.plot(title=title)
# Data procesing
all_weekdays = pd.date_range(start=start_date, end=end_date, freq='B')
panel_data = panel_data.reindex(all_weekdays)
panel_data = panel_data.fillna(method='ffill')
# Plot
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10,6))
comp_title = '{} returns comparation'.format(ticker)
fig.suptitle(comp_title)
sum_log_returns = tlog_r(panel_data, start_date, end_date)
ax1.plot(sum_log_returns.index, sum_log_returns, label=ticker)
ax1.set_ylabel('Cumulative log returns')
ax1.legend(loc='best')
tot_logreturns = tr_rel(panel_data)
ax2.plot(tot_logreturns.index, tot_logreturns, label=ticker)
ax2.set_ylabel('Total relative returns (%)')
ax2.legend(loc='best')
plt.show()
Here you have a minimal reproducible example, you will have to import the functions, pandas, numpy and copy.
ticker='AAPL'
start_date='2000-01-01'
end_date='2019-12-31'
price='Close'
panel_data = data.DataReader(ticker , 'yahoo', start_date, end_date)[price]
all_weekdays = pd.date_range(start=start_date, end=end_date, freq='B')
panel_data = panel_data.reindex(all_weekdays)
panel_data = panel_data.fillna(method='ffill')
sum_log_returns = tlog_r(panel_data, start_date, end_date)
print(sum_log_returns)
tot_logreturns = tr_rel(panel_data)
print(tot_logreturns)

Adding a 45 degree line to a time series stock data plot

I guess this is supposed to be simple.. But I cant seem to make it work.
I have some stock data
import pandas as pd
import numpy as np
df = pd.DataFrame(index=pd.date_range(start = "06/01/2018", end = "08/01/2018"),
data = np.random.rand(62)*100)
I am doing some analysis on it, this results of my drawing some lines on the graph.
And I want to plot a 45 line somewhere on the graph as a reference for lines I drew on the graph.
What I have tried is
x = df.tail(len(df)/20).index
x = x.reset_index()
x_first_val = df.loc[x.loc[0].date].adj_close
In order to get some point and then use slope = 1 and calculate y values.. but this sounds all wrong.
Any ideas?
Here is a possibility:
import pandas as pd
import numpy as np
df = pd.DataFrame(index=pd.date_range(start = "06/01/2018", end = "08/01/2018"),
data=np.random.rand(62)*100,
columns=['data'])
# Get values for the time:
index_range = df.index[('2018-06-18' < df.index) & (df.index < '2018-07-21')]
# get the timestamps in nanoseconds (since epoch)
timestamps_ns = index_range.astype(np.int64)
# convert it to a relative number of days (for example, could be seconds)
time_day = (timestamps_ns - timestamps_ns[0]) / 1e9 / 60 / 60 / 24
# Define y-data for a line:
slope = 3 # unit: "something" per day
something = time_day * slope
trendline = pd.Series(something, index=index_range)
# Graph:
df.plot(label='data', alpha=0.8)
trendline.plot(label='some trend')
plt.legend(); plt.ylabel('something');
which gives:
edit - first answer, using dayofyear instead of the timestamps:
import pandas as pd
import numpy as np
df = pd.DataFrame(index=pd.date_range(start = "06/01/2018", end = "08/01/2018"),
data=np.random.rand(62)*100,
columns=['data'])
# Define data for a line:
slope = 3 # unit: "something" per day
index_range = df.index[('2018-06-18' < df.index) & (df.index < '2018-07-21')]
dayofyear = index_range.dayofyear # it will not work around the new year...
dayofyear = dayofyear - dayofyear[0]
something = dayofyear * slope
trendline = pd.Series(something, index=index_range)
# Graph:
df.plot(label='data', alpha=0.8)
trendline.plot(label='some trend')
plt.legend(); plt.ylabel('something');

Categories

Resources