From the code given here, I have developed another version which uses Matplotlib in place of Seaborn. The data are plotted on several figures and subplots, so they are now more readable, and I am closer to the point I want to reach: when the user puts the cursor over a point, he gets access to all the information of that point, in particular the datetime.
Here it is:
import pandas as pd
import numpy as np
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import random
from datetime import datetime
# size of the database
n = 1000
nA = 4
nB = 9
no = np.arange(n)
date = np.random.randint(1e9, size=n).astype('datetime64[s]')
A = [''.join(['A',str(random.randint(1, nA))]) for j in range(n)]
B = [''.join(['B',str(random.randint(1, nB))]) for j in range(n)]
Epsilon1 = np.random.random_sample((n,))
Epsilon2 = np.random.random_sample((n,))
Epsilon3 = np.random.random_sample((n,))
data = pd.DataFrame({'no': no,
                     'Date': date,
                     'A': A,
                     'B': B,
                     'Epsilon1': Epsilon1,
                     'Epsilon2': Epsilon2,
                     'Epsilon3': Epsilon3})

def format_coord(x, y):
    string_x = datetime.utcfromtimestamp(x).strftime("%m/%d/%Y, %H:%M:%S")
    return 'x={}, y={:.4f}'.format(string_x, y)

def plot_Epsilon_matplotlib():
    for A in data['A'].sort_values().drop_duplicates().to_list():
        n_col = 2
        fig, axes = plt.subplots(np.ceil(nB/n_col).astype(int), n_col)
        for j, B in enumerate(data['B'].sort_values().drop_duplicates().to_list()):
            df = data.loc[(data['A']==A) & (data['B']==B)]
            df = df.sort_values("Date", ascending=True)
            axes.flatten()[j].plot(df["Date"], df['Epsilon1'], marker='x', c='b', label="Epsilon1")
            axes.flatten()[j].plot(df["Date"], df['Epsilon2'], marker='x', c='r', label="Epsilon2")
            axes.flatten()[j].plot(df["Date"], df['Epsilon3'], marker='x', c='g', label="Epsilon3")
            axes.flatten()[j].format_coord = format_coord

if __name__ == '__main__':
    plot_Epsilon_matplotlib()
The goal is that when the user puts the cursor over a point, he gets access to the full datetime of the data.
I first tried to change the major formatter (as here):
axes.flatten()[j].xaxis.set_major_formatter(mdates.DateFormatter('%Y/%m/%d %H:%M:%S'))
but then the x ticks are not readable (especially if the user zooms in on a subplot).
I then tried to define my own format_coord, as here. My first try is given in the full code above. The format of the datetime in the Matplotlib figure status bar is good, but the date stays in 1970!
After reading this discussion, I realized this problem relates to NumPy datetime64 to datetime conversion. I then coded this new version of format_coord (strongly inspired by this answer):
def format_coord_bis(x, y):
    dt64 = np.datetime64(datetime.utcfromtimestamp(x))
    unix_epoch = np.datetime64(0, 's')
    one_second = np.timedelta64(1, 's')
    seconds_since_epoch = (dt64 - unix_epoch) / one_second
    string_x = datetime.utcfromtimestamp(seconds_since_epoch).strftime("%m/%d/%Y, %H:%M:%S")
    return 'x={}, y={:.4f}'.format(string_x, y)
but the date shown in the status bar remains 01/01/1970...
I have found the solution in this answer. The x value passed to format_coord is a Matplotlib date number (a float counting days), not a Unix timestamp in seconds, which is why datetime.utcfromtimestamp() always lands in 1970.
The function format_coord() should be defined as follows (using the mdates alias already imported at the top of the question's code):
def format_coord(x, y):
    string_x = mdates.num2date(x).strftime('%Y-%m-%d %H:%M:%S')
    return 'x={}, y={:.4f}'.format(string_x, y)
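As a quick check, here is a minimal, self-contained sketch that wires this format_coord into a single axes (it uses small synthetic data for illustration, not the original DataFrame); hovering over a point should now show the full datetime in the status bar:

import numpy as np
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

# Small synthetic series for illustration only (not the original data).
dates = np.random.randint(int(1e9), size=50).astype('datetime64[s]')
values = np.random.random_sample(50)

def format_coord(x, y):
    # x arrives as a Matplotlib date number (float days), so convert with num2date.
    string_x = mdates.num2date(x).strftime('%Y-%m-%d %H:%M:%S')
    return 'x={}, y={:.4f}'.format(string_x, y)

fig, ax = plt.subplots()
ax.plot(np.sort(dates), values, marker='x')
ax.format_coord = format_coord
plt.show()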
I have a problem with my code: somehow it keeps giving me a KeyError: "None of [Float]...".
I need to calculate the mid price:
P_mid = (P_offer + P_bid) / 2
and the volume-weighted mid price:
VWMP = (P_bid * Size_offer + P_offer * Size_bid) / (Size_offer + Size_bid)
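(For example, using the second sample row below, bid 32.67 × 8 and offer 32.70 × 31: P_mid = (32.70 + 32.67)/2 = 32.685 and VWMP = (32.67·31 + 32.70·8)/(31 + 8) ≈ 32.676.)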
So far my code looks like this:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
nasdaq_1 = pd.read_csv (r'Path to csv')
np.array(nasdaq_1)
#print(nasdaq_1)
mid_price = (np.array(nasdaq_1.Offer_Price) + np.array(nasdaq_1.Bid_Price))/2
#print(mid_price)
weightet_mid_price = (np.array(nasdaq_1.Offer_Price)*np.array(nasdaq_1.Bid_Size) + np.array(nasdaq_1.Bid_Price)*np.array(nasdaq_1.Offer_Size))/(np.array(nasdaq_1.Offer_Size)+np.array(nasdaq_1.Bid_Size))
print(weightet_mid_price)
nasdaq_1[mid_price].plot()
plt.figure(figsize=(10,10))
plt.plot(nasdaq_1.index, nasdaq_1[mid_price])
plt.xlabel("Datetime")
plt.ylabel("$ price")
plt.title("Mid-price between bid and offer prices")
All help is highly appreciated!!
CSV data sample:
DateTime,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size
2017-01-03 09:30:00,93000766290000.0,T,PFE,32.55,8.0,32.76,8.0
2017-01-03 09:30:01,93001992610000.0,T,PFE,32.67,8.0,32.7,31.0
2017-01-03 09:30:02,93002933311000.0,T,PFE,32.67,7.0,32.7,2.0
2017-01-03 09:30:03,93003882764000.0,T,PFE,32.7,1.0,32.76,17.0
2017-01-03 09:30:04,93004943608000.0,T,PFE,32.7,1.0,32.73,13.0
2017-01-03 09:30:05,93005991747000.0,T,PFE,32.69,2.0,32.74,41.0
2017-01-03 09:30:06,93006506218000.0,T,PFE,32.67,5.0,32.74,41.0
You do not need to cast the data frame columns into numpy arrays for your calculations.
The error you see is due to the line nasdaq_1[mid_price].plot().
df[x] expects x to be either a column name or a list/array of column names. You are passing a numpy array of float values, none of which exist as columns, hence the KeyError.
Try the following:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
s = io.StringIO("""DateTime,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size
2017-01-03 09:30:00,93000766290000.0,T,PFE,32.55,8.0,32.76,8.0
2017-01-03 09:30:01,93001992610000.0,T,PFE,32.67,8.0,32.7,31.0
2017-01-03 09:30:02,93002933311000.0,T,PFE,32.67,7.0,32.7,2.0
2017-01-03 09:30:03,93003882764000.0,T,PFE,32.7,1.0,32.76,17.0
2017-01-03 09:30:04,93004943608000.0,T,PFE,32.7,1.0,32.73,13.0
2017-01-03 09:30:05,93005991747000.0,T,PFE,32.69,2.0,32.74,41.0
2017-01-03 09:30:06,93006506218000.0,T,PFE,32.67,5.0,32.74,41.0
""")
nasdaq_1 = pd.read_csv(s, parse_dates=['DateTime'])
mid_price = (nasdaq_1["Offer_Price"] + nasdaq_1["Bid_Price"])/2
weightet_mid_price = (
(nasdaq_1["Offer_Price"]*nasdaq_1["Bid_Size"] + nasdaq_1["Bid_Price"]*nasdaq_1["Offer_Size"])
/ (nasdaq_1["Offer_Size"] + nasdaq_1["Bid_Size"])
)
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(nasdaq_1["DateTime"], mid_price)
ax.set_xlabel("Datetime")
ax.set_ylabel("$ price")
ax.set_title("Mid-price between bid and offer prices")
fig.autofmt_xdate()
Edit:
Parse the DateTime column (parse_dates=['DateTime'] above) so that it holds datetime values instead of strings.
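If you also want the volume-weighted mid price on the same figure, one small extension of the snippet above (reusing ax, nasdaq_1 and weightet_mid_price) is to add a second line and a legend before showing the figure:

# Add the volume-weighted mid price to the same axes and label both series
ax.plot(nasdaq_1["DateTime"], weightet_mid_price, label="volume-weighted mid price")
ax.legend(["mid price", "volume-weighted mid price"])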
We are currently using 3 different dataframes to store product, performance and assortments data.
The foreign key relationship is maintained between all the dimensions.
I need to update the cost column in the performance dataframe by doing the math operation below:
performance['cost'] = performance['column1'] + sin(product['column3']) + 2*Assortment['column2']
I need this operation to be performed for each row of the performance dataframe.
Please suggest any approach to make the calculations faster.
The performance dataframe consists of 1 million records.
Can we use any other approach rather than dataframes?
One approach could be to pre-calculate the sin of the entire column. I used 1 million records and it was pretty fast for me.
import pandas as pd
import math
import numpy as np

# Pre-compute sin(col3) once, as an extra column on the product frame
product = pd.DataFrame(columns=['col3', 'sincol3'])
product['col3'] = np.random.randn(1000000)
s = pd.Series(product['col3'])
product['sincol3'] = np.sin(s)

performance = pd.DataFrame(columns=['col1', 'cost'])
performance['col1'] = np.random.randn(1000000)

assortments = pd.DataFrame(columns=['col2'])
assortments['col2'] = np.random.randn(1000000)

# Vectorized, index-aligned calculation across the three frames
performance['cost'] = performance['col1'] + product['sincol3'] + 2*assortments['col2']
print(performance)
It gives you output like:
col1 cost
0 0.194011 -1.940931
1 0.535375 1.891468
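Note that the vectorized addition above relies on the three DataFrames being aligned row for row on the same index. If your frames are instead linked through foreign-key columns, you would merge first and then apply the same arithmetic. A rough, self-contained sketch, where product_id and assortment_id are hypothetical key names that are not in the original question:

import pandas as pd
import numpy as np

# Hypothetical key columns ('product_id', 'assortment_id') for illustration only.
performance = pd.DataFrame({'col1': [0.1, 0.2, 0.3],
                            'product_id': [1, 2, 1],
                            'assortment_id': [10, 10, 20]})
product = pd.DataFrame({'product_id': [1, 2], 'sincol3': np.sin([0.5, 1.5])})
assortments = pd.DataFrame({'assortment_id': [10, 20], 'col2': [0.7, -0.2]})

# Merge on the keys, then do the same vectorized calculation on the merged frame
merged = (performance
          .merge(product, on='product_id', how='left')
          .merge(assortments, on='assortment_id', how='left'))
merged['cost'] = merged['col1'] + merged['sincol3'] + 2 * merged['col2']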
Edit after comments:
You have to understand that the expression on its own does not take a lot of time to calculate, provided the expression is the only thing you are doing at run time (i.e. your data frames already hold their values). Let's compare the two cases, averaged over 50 runs.
Example:
import pandas as pd
import math
import numpy as np
import time

def cal():
    performance['cost'] = performance['col1'] + product['sincol3'] + 2*assortments['col2']

execution_time = []

product = pd.DataFrame(columns=['col3', 'sincol3'])
product['col3'] = np.random.randn(1000000)
s = pd.Series(product['col3'])
product['sincol3'] = np.sin(s)

performance = pd.DataFrame(columns=['col1', 'cost'])
performance['col1'] = np.random.randn(1000000)

assortments = pd.DataFrame(columns=['col2'])
assortments['col2'] = np.random.randn(1000000)

for i in range(0, 50):
    start_time = time.time()
    cal()
    execution_time.append(time.time() - start_time)

print('average time taken:\n')
print(np.mean(execution_time))
Gives me: average time taken: 0.15080997943878174
Whereas, timing the data frame construction together with the calculation:
import pandas as pd
import math
import numpy as np
import time

def cal():
    product = pd.DataFrame(columns=['col3', 'sincol3'])
    product['col3'] = np.random.randn(1000000)
    s = pd.Series(product['col3'])
    product['sincol3'] = np.sin(s)

    performance = pd.DataFrame(columns=['col1', 'cost'])
    performance['col1'] = np.random.randn(1000000)

    assortments = pd.DataFrame(columns=['col2'])
    assortments['col2'] = np.random.randn(1000000)

    performance['cost'] = performance['col1'] + product['sincol3'] + 2*assortments['col2']

execution_time = []

for i in range(0, 50):
    start_time = time.time()
    cal()
    execution_time.append(time.time() - start_time)

print('average time taken:\n')
print(np.mean(execution_time))
Gives me: average time taken: 0.5624121456611447. In other words, the vectorized expression itself costs roughly 0.15 s per run; the rest of the ~0.56 s is spent rebuilding the three 1-million-row DataFrames on every call.
I guess this is supposed to be simple, but I can't seem to make it work.
I have some stock data:
import pandas as pd
import numpy as np
df = pd.DataFrame(index=pd.date_range(start = "06/01/2018", end = "08/01/2018"),
data = np.random.rand(62)*100)
I am doing some analysis on it, which results in my drawing some lines on the graph.
And I want to plot a 45° line somewhere on the graph as a reference for the lines I drew.
What I have tried is
x = df.tail(len(df)/20).index
x = x.reset_index()
x_first_val = df.loc[x.loc[0].date].adj_close
in order to get some point and then use slope = 1 to calculate y values... but this all sounds wrong.
Any ideas?
Here is a possibility:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.DataFrame(index=pd.date_range(start = "06/01/2018", end = "08/01/2018"),
data=np.random.rand(62)*100,
columns=['data'])
# Get values for the time:
index_range = df.index[('2018-06-18' < df.index) & (df.index < '2018-07-21')]
# get the timestamps in nanoseconds (since epoch)
timestamps_ns = index_range.astype(np.int64)
# convert it to a relative number of days (for example, could be seconds)
time_day = (timestamps_ns - timestamps_ns[0]) / 1e9 / 60 / 60 / 24
# Define y-data for a line:
slope = 3 # unit: "something" per day
something = time_day * slope
trendline = pd.Series(something, index=index_range)
# Graph:
df.plot(label='data', alpha=0.8)
trendline.plot(label='some trend')
plt.legend(); plt.ylabel('something');
which gives a plot of the random data with the trend line overlaid.
Edit - the first version of this answer used dayofyear instead of the timestamps:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.DataFrame(index=pd.date_range(start = "06/01/2018", end = "08/01/2018"),
data=np.random.rand(62)*100,
columns=['data'])
# Define data for a line:
slope = 3 # unit: "something" per day
index_range = df.index[('2018-06-18' < df.index) & (df.index < '2018-07-21')]
dayofyear = index_range.dayofyear # it will not work around the new year...
dayofyear = dayofyear - dayofyear[0]
something = dayofyear * slope
trendline = pd.Series(something, index=index_range)
# Graph:
df.plot(label='data', alpha=0.8)
trendline.plot(label='some trend')
plt.legend(); plt.ylabel('something');
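If the goal is specifically a 45° reference that starts at the data itself (what the x_first_val attempt in the question was aiming for), here is a possible sketch, reusing time_day and index_range from the first snippet and assuming "45°" means one data unit per day:

slope = 1  # one data unit per day, i.e. a 45-degree line in these units
y_start = df['data'].loc[index_range[0]]   # anchor the line at the first point of the range
trendline_45 = pd.Series(y_start + time_day * slope, index=index_range)

ax = df.plot(label='data', alpha=0.8)
trendline_45.plot(ax=ax, label='45-degree reference')
plt.legend()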
I was wondering how one rotates the x-labels, something along the lines of:
theme(axis.text.x = element_text(angle = 90, hjust = 1))
in ggplot?
Thank you.
Plotnine is basically a clone of ggplot, so you can call (almost) exactly that.
Here's an example:
import pandas as pd
from datetime import datetime, timedelta
from plotnine import ggplot, geom_point, aes, theme, element_text
now = datetime.now()
ago_28days = now - timedelta(days=28)
delta = now - ago_28days
timestamps = [ago_28days + timedelta(days=i) for i in range(delta.days)]
df = pd.DataFrame(data={'timestamp': timestamps, 'value':list(range(28))})
(ggplot(df) +
geom_point(aes('timestamp', 'value')) +
theme(axis_text_x=element_text(rotation=90, hjust=1))
)