Can anybody help me optimize this plotting function in Python? I use Matplotlib to plot financial data. Here is a small function for plotting OHLC data. The run time increases significantly if I add indicators or other data.
import numpy as np
import datetime
from matplotlib.collections import LineCollection
from pylab import *
import urllib2
def _ohlc_segments(x_start, y_start, x_end, y_end):
    """Pair coordinates into ``[((x0, y0), (x1, y1)), ...]`` line segments."""
    return [((x0, y0), (x1, y1))
            for x0, y0, x1, y1 in zip(x_start, y_start, x_end, y_end)]


def test_plot(OHLCV):
    """Draw one OHLC bar chart from ``OHLCV`` and save it as ``test.png``.

    ``OHLCV`` is indexed as a 2-D array: column 0 is handed to ``date2num``
    and columns 1-4 are read as open, high, low and close.  Every bar is a
    vertical high-low line, a tick to the left at the open and a tick to the
    right at the close; bars whose open is below their close are green, all
    others red.

    All values are taken from the ``OHLCV`` argument only (the tick end
    points used to be read from a module-level ``parsed_table``).
    """
    bar_width = 1.3
    date_offset = 0.5

    fig = figure(figsize=(50, 20), facecolor='w')
    ax = fig.add_subplot(1, 1, 1)
    setp(ax.get_xmajorticklabels(), rotation=0)

    opens, highs, lows, closes = (OHLCV[:, col] for col in (1, 2, 3, 4))
    rising = np.asarray(opens < closes, dtype=bool)
    # A native string array, so the colours are valid on Python 2 and 3.
    color = np.where(rising, 'green', 'red')

    dates = np.asarray(date2num(OHLCV[:, 0]))

    segments_hl = _ohlc_segments(dates, highs, dates, lows)
    segments_op = _ohlc_segments(dates - date_offset, opens, dates, opens)
    segments_cl = _ohlc_segments(dates + date_offset, closes, dates, closes)

    # Same insertion order as before: high-low, close, open.
    for segments in (segments_hl, segments_cl, segments_op):
        lines = LineCollection(segments)
        lines.set_color(color)
        lines.set_linewidth(bar_width)
        ax.add_collection(lines, autolim=True)

    ax.xaxis.set_major_locator(MonthLocator())
    ax.xaxis.set_major_formatter(DateFormatter('%Y-%m-%d'))
    ax.xaxis.set_minor_locator(DayLocator())
    ax.autoscale_view()
    ax.xaxis.grid(True, 'major')
    ax.grid(True)
    ax.set_title('EOD test plot')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price , $')

    fig.savefig('test.png', dpi=50, bbox_inches='tight')
    # Close this figure explicitly rather than "whichever is current".
    close(fig)
if __name__ == '__main__':
    # Download the CSV, drop the header row and reverse the remaining rows.
    data_table = urllib2.urlopen(r"http://ichart.finance.yahoo.com/table.csv?s=IBM&a=00&b=1&c=2012&d=00&e=15&f=2013&g=d&ignore=.csv").readlines()[1:][::-1]

    #Format: Date, Open, High, Low, Close, Volume
    converters = (lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').date(),
                  float, float, float, float, int)

    # The last CSV field of every row is dropped before conversion.
    parsed_table = np.array([
        [convert(cell)
         for convert, cell in zip(converters, row.strip().split(',')[:-1])]
        for row in data_table
    ])

    import time
    bf = time.time()
    count = 100
    for i in xrange(count):
        test_plot(parsed_table)

    # Divide first, then format: '%' binds tighter than '/', so the old
    # expression tried to divide the formatted string by ``count``.
    print('Plot time: %s' % ((time.time() - bf) / count))
The result is something like this. The average execution time for each plot is approximately 2.6 s. Charting in R is much faster, but I didn't measure its performance and I don't want to use Rpy, so I believe that my code is inefficient.
This solution reuses a Figure instance and saves plots asynchronously. You could change this to have as many figures as there are processors, do that many plots asynchronously, and it should speed things up even more. As it is, this takes ~1s per plot, down from 2.6 on my machine.
import numpy as np
import datetime
import urllib2
import time
import multiprocessing as mp
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pylab import *
from matplotlib.collections import LineCollection
class AsyncPlotter():
    """Save figures in child processes, at most ``processes`` at a time.

    ``save`` starts one ``multiprocessing.Process`` per figure.  Each child
    waits until the shared counter ``nc`` is below ``processes``, then saves
    and closes its figure.  ``join`` waits for every started child.

    NOTE(review): the bound method is used as the process target, so this
    presumably relies on the ``fork`` start method -- confirm before running
    on a platform that spawns instead.
    """

    def __init__(self, processes=mp.cpu_count()):
        self.manager = mp.Manager()
        # Number of children currently inside savefig().
        self.nc = self.manager.Value('i', 0)
        # Guards the check-and-increment on ``nc``; without it two children
        # can read the same value and both claim the last free slot.
        self.lock = self.manager.Lock()
        self.pids = []
        self.processes = processes

    def async_plotter(self, nc, fig, filename, processes):
        """Child entry point: wait for a free slot, then save ``fig``."""
        while True:
            with self.lock:
                if nc.value < processes:
                    nc.value += 1
                    break
            time.sleep(0.1)
        try:
            print("Plotting " + filename)
            fig.savefig(filename)
            plt.close(fig)
        finally:
            # Always give the slot back, even if savefig() raised, so that
            # waiting children are not blocked forever.
            with self.lock:
                nc.value -= 1

    def save(self, fig, filename):
        """Start a child process that writes ``fig`` to ``filename``."""
        p = mp.Process(target=self.async_plotter,
                       args=(self.nc, fig, filename, self.processes))
        p.start()
        self.pids.append(p)

    def join(self):
        """Block until every child started by ``save`` has finished."""
        for p in self.pids:
            p.join()
class FinanceChart():
    """One reusable OHLC figure whose line collections are refilled per plot.

    The figure, axes, locators and three ``LineCollection`` objects are
    created once in ``__init__``.  ``test_plot`` replaces the segments and
    colours and hands the figure to ``async_plotter.save``.
    """

    def __init__(self, async_plotter):
        self.async_plotter = async_plotter
        self.bar_width = 1.3
        self.date_offset = 0.5
        self.fig = plt.figure(figsize=(50, 20), facecolor='w')
        self.ax = self.fig.add_subplot(1, 1, 1)
        self.labels = self.ax.get_xmajorticklabels()
        setp(self.labels, rotation=0)

        # Dummy segments; they are replaced on the first test_plot() call.
        placeholder = [[(734881, 1), (734882, 5), (734883, 9), (734889, 5)]]
        # Each attribute now holds the collection it is named after (the
        # open/close placeholders used to be cross-assigned).
        self.lines_hl = self.ax.add_collection(LineCollection(placeholder), autolim=True)
        self.lines_op = self.ax.add_collection(LineCollection(placeholder), autolim=True)
        self.lines_cl = self.ax.add_collection(LineCollection(placeholder), autolim=True)

        self.ax.set_title('EOD test plot')
        self.ax.set_xlabel('Date')
        self.ax.set_ylabel('Price , $')
        self.ax.xaxis.set_major_locator(MonthLocator())
        self.ax.xaxis.set_major_formatter(DateFormatter('%Y-%m-%d'))
        self.ax.xaxis.set_minor_locator(DayLocator())

    @staticmethod
    def _segments(x_start, y_start, x_end, y_end):
        """Pair coordinates into ``[((x0, y0), (x1, y1)), ...]`` segments."""
        return [((x0, y0), (x1, y1))
                for x0, y0, x1, y1 in zip(x_start, y_start, x_end, y_end)]

    def test_plot(self, OHLCV, i):
        """Refill the chart from ``OHLCV`` and queue it as ``'%04i.png' % i``.

        ``OHLCV`` is indexed as a 2-D array: column 0 goes to ``date2num``,
        columns 1-4 are read as open, high, low and close.
        """
        opens, highs, lows, closes = (OHLCV[:, col] for col in (1, 2, 3, 4))
        rising = np.asarray(opens < closes, dtype=bool)
        color = np.where(rising, 'green', 'red')

        dates = np.asarray(date2num(OHLCV[:, 0]))
        offset = self.date_offset

        updates = (
            (self.lines_hl, self._segments(dates, highs, dates, lows)),
            (self.lines_op, self._segments(dates - offset, opens, dates, opens)),
            (self.lines_cl, self._segments(dates + offset, closes, dates, closes)),
        )
        for collection, segments in updates:
            collection.set_segments(segments)
            collection.set_color(color)
            collection.set_linewidth(self.bar_width)

        # Pad the x range by the tick length so the first open tick and the
        # last close tick are not cut off.
        self.ax.set_xlim(np.min(dates) - offset, np.max(dates) + offset)
        # The visible range must span low..high; it used to be taken from
        # the open column, which clipped most bars.
        self.ax.set_ylim(np.min(lows), np.max(highs))

        self.ax.xaxis.grid(True, 'major')
        self.ax.grid(True)
        self.async_plotter.save(self.fig, '%04i.png'%i)
if __name__ == '__main__':
    print("Starting")

    # Fetch the CSV, skip its header row and reverse the remaining rows.
    raw_rows = urllib2.urlopen(r"http://ichart.finance.yahoo.com/table.csv?s=IBM&a=00&b=1&c=2012&d=00&e=15&f=2013&g=d&ignore=.csv").readlines()[1:][::-1]

    #Format: Date, Open, High, Low, Close, Volume
    converters = (lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').date(),
                  float, float, float, float, int)

    # One converted record per row; the last CSV field is discarded.
    parsed_table = np.array([
        [convert(cell)
         for convert, cell in zip(converters, line.strip().split(',')[:-1])]
        for line in raw_rows
    ])

    import time
    started = time.time()
    count = 10
    a = AsyncPlotter()
    _chart = FinanceChart(a)
    print("Done with startup tasks")

    for i in xrange(count):
        _chart.test_plot(parsed_table, i)
    # Wait for every background save before reading the clock.
    a.join()

    elapsed = float(time.time() - started)
    print('Plot time: %.2f' %(elapsed / float(count)))
Related
How can I add a linear regression to this Bokeh plot? I'm having trouble with this and don't know how to add the regression line to the figure (or how to add it to the curdoc expression). I've seen other posts, but haven't found a way to add it to the Bokeh figure. Please help me by showing how to add that line to the figure.
import pandas as pd
from bokeh.layouts import column, row
from bokeh.models import Select
from bokeh.palettes import Spectral5
from bokeh.plotting import curdoc, figure
from bokeh.sampledata.autompg import autompg_clean as df
# Work on a copy of the imported sample table.
df = df.copy()

# Marker sizes and colours available for bucketing.
SIZES = list(range(6, 22, 3))
COLORS = Spectral5
N_SIZES, N_COLORS = len(SIZES), len(COLORS)

# data cleanup: 'cyl' and 'yr' become strings, 'name' is removed
df['cyl'] = df['cyl'].astype(str)
df['yr'] = df['yr'].astype(str)
df.drop(columns=['name'], inplace=True)

# Column names, split by whether the column has object dtype.
columns = sorted(df.columns)
discrete = [name for name in columns if df[name].dtype == object]
continuous = [name for name in columns if name not in discrete]
def _bucket_values(column, choices):
    """Map every row of ``df[column]`` to one entry of ``choices``.

    Columns with more distinct values than ``choices`` are cut into
    quantile bins; otherwise every distinct value gets its own entry.
    """
    if len(set(df[column])) > len(choices):
        groups = pd.qcut(df[column].values, len(choices), duplicates='drop')
    else:
        groups = pd.Categorical(df[column])
    return [choices[code] for code in groups.codes]


def create_figure():
    """Build the scatter figure for the current widget selections.

    Reads the module-level ``x``, ``y``, ``size`` and ``color`` Select
    widgets and the ``df`` table, and returns a new bokeh figure.
    """
    # ``pd.np`` was deprecated and later removed from pandas, so pi now
    # comes from the standard library.
    from math import pi

    xs = df[x.value].values
    ys = df[y.value].values
    x_title = x.value.title()
    y_title = y.value.title()

    kw = dict()
    # Discrete columns get an explicit categorical range.
    if x.value in discrete:
        kw['x_range'] = sorted(set(xs))
    if y.value in discrete:
        kw['y_range'] = sorted(set(ys))
    kw['title'] = "%s vs %s" % (x_title, y_title)

    p = figure(height=600, width=800, tools='pan,box_zoom,hover,reset', **kw)
    p.xaxis.axis_label = x_title
    p.yaxis.axis_label = y_title

    if x.value in discrete:
        # Tilt category labels by 45 degrees.
        p.xaxis.major_label_orientation = pi / 4

    # Fixed size and colour unless a column is selected for them.
    sz = 9
    if size.value != 'None':
        sz = _bucket_values(size.value, SIZES)

    c = "#31AADE"
    if color.value != 'None':
        c = _bucket_values(color.value, COLORS)

    p.circle(x=xs, y=ys, color=c, size=sz, line_color="white", alpha=0.6, hover_color='white', hover_alpha=0.5)
    return p
def update(attr, old, new):
    """Widget callback: rebuild the figure and swap it into the layout."""
    refreshed = create_figure()
    # Slot 1 of the row is the figure; slot 0 holds the controls.
    layout.children[1] = refreshed
# Axis selectors offer every column.
x = Select(title='X-Axis', value='mpg', options=columns)
y = Select(title='Y-Axis', value='hp', options=columns)
# Size and colour selectors offer 'None' plus the continuous columns.
size = Select(title='Size', value='None', options=['None', *continuous])
color = Select(title='Color', value='None', options=['None', *continuous])

# Any change of selection rebuilds the figure.
x.on_change('value', update)
y.on_change('value', update)
size.on_change('value', update)
color.on_change('value', update)

controls = column(x, y, color, size, width=200)
layout = row(controls, create_figure())

document = curdoc()
document.add_root(layout)
document.title = "Crossfilter"
I'm using a Raspberry Pi to plot live data from serial, but eventually run out of memory. I'm not sure if/how I can close the figure, but still have a live data display.
Would it be possible to create and close a new figure with every animate?
My code at the moment:
import serial
import matplotlib
# Select the Tk-based interactive backend (TkAgg); note that this backend does need a display.
matplotlib.use('TkAgg') #comment out for debugging
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import gc
# Figure that the animation draws into.
fig = plt.figure()

# Shared sample buffers; they are handed to animate() through fargs.
xs, ysAC, ysDC = [], [], []

# Open the serial port and flush it before the first read.
ser = serial.Serial('/dev/ttyUSB0', baudrate=115200, timeout=1)
ser.flush()
# This function is called periodically from FuncAnimation
def animate(i, xs, ysAC, ysDC):
    """Read one sample and redraw the last ten readings.

    ``i`` is the frame argument supplied by the animation; ``xs``, ``ysAC``
    and ``ysDC`` are the shared buffers passed through ``fargs``.  The
    buffers are trimmed IN PLACE to ten entries.  Re-binding the parameter
    names (``ysDC = ysDC[-10:]``) only shortened a local copy and let the
    shared lists grow for as long as the program ran.

    NOTE(review): ``axRT1`` is not created anywhere in the visible script;
    presumably it is a module-level Axes -- confirm.
    """
    values = getValues()
    # getValues() yields None when nothing is waiting on the port; a short
    # line cannot supply fields 1 and 2 either.  Skip such frames.
    if values is None or len(values) < 3:
        return

    wAC = values[1]
    wDC = values[2]
    # NOTE(review): these are still the raw text fields; convert them to
    # numbers once the wire format is confirmed.

    # Add the new sample, then drop everything but the newest ten entries.
    xs.append(i)
    ysAC.append(wAC)
    ysDC.append(wDC)
    for buffer in (xs, ysAC, ysDC):
        del buffer[:-10]

    # The x axis always shows fixed relative labels, not frame numbers.
    tick_labels = ['T-9','T-8','T-7','T-6','T-5','T-4','T-3','T-2','T-1','Now']

    # Draw x and y lists
    axRT1.clear()
    if len(ysDC) == 10:
        axRT1.plot(tick_labels, ysAC, 'b:', label='Mains', linewidth = 4)
        axRT1.plot(tick_labels, ysDC, 'g--', label='Solar', linewidth = 4)
    gc.collect()
def getValues():
    """Return the next serial line split on commas, or None if none waits."""
    if ser.in_waiting <= 0:
        return None
    raw = ser.readline()
    text = raw.decode('utf-8').rstrip()
    return text.split(",")
# Set up plot to call animate() function periodically.
# fargs hands the three module-level lists to animate() after the frame
# argument; interval is 1000.
ani = animation.FuncAnimation(fig, animate, fargs=(xs, ysAC, ysDC), interval=1000, blit=False)
# Toggle full-screen mode on the current figure's window.
plt.get_current_fig_manager().full_screen_toggle()
# Interactive mode off before showing the figure.
plt.ioff()
plt.show()
# NOTE(review): this line is only reached after plt.show() returns --
# presumably once the window is closed; confirm it is still needed.
plt.draw()
The crude way of clearing the plots marked below fixed it for me:
import time
import serial
import datetime as dt
import matplotlib
# Select the Tk-based interactive backend (TkAgg); note that this backend does need a display.
matplotlib.use('TkAgg') #comment out for debugging
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
import matplotlib.animation as animation
from decimal import Decimal
import pandas as pd
import numpy as np
import os.path as path
import re
import gc
import os
# Module-level counter; nothing in the visible script reads or updates it.
count = 0
# Create figure for plotting
fig = plt.figure()
fig.patch.set_facecolor('whitesmoke')
# Font settings shared by every axis label and the title below.
hFont = {'fontname':'sans-serif', 'weight':'bold', 'size':'12'}
# Fixed x labels for the ten plotted samples.
xs = ['T-9','T-8','T-7','T-6','T-5','T-4','T-3','T-2','T-1','Now']
# Sample buffers; they are handed to animate() through fargs.
ysTemp = []
ysAC = []
ysDC = []
# Top-left cell of a 2x2 grid: power on the left axis, temperature on the twin.
axRT1 = fig.add_subplot(2, 2, 1)
axRT2 = axRT1.twinx() # instantiate a second axes that shares the same x-axis
# Clear both axes, then configure limits, labels and title once, up front.
axRT1.clear()
axRT2.clear()
axRT1.set_ylim([0, 4])
axRT2.set_ylim([10, 70])
axRT1.set_ylabel('Power Consumption kW', **hFont)
axRT2.set_ylabel('Temperature C', **hFont)
axRT1.set_xlabel('Seconds', **hFont)
axRT1.set_title('Power Consumption and Temperature - Real Time', **hFont)
# Empty lines created only so the figure legend has handles to show.
lineTemp, = axRT2.plot([], [], 'r', label='Temp', linewidth = 4)
lineAC, = axRT1.plot([], [], 'b:', label='Mains', linewidth = 4)
lineDC, = axRT1.plot([], [], 'g--', label='Solar', linewidth = 4)
fig.legend([lineAC, lineDC,lineTemp], ['Mains', 'Solar', 'Temp'], fontsize=20)
# Open the serial port and flush it before the first read.
ser = serial.Serial('/dev/ttyUSB0', 115200, timeout=1)
ser.flush()
# This function is called periodically from FuncAnimation
def animate(i, xs, ysTemp, ysAC, ysDC):
    """Read one measurement line and redraw the last ten samples.

    ``getValues()`` returns 0 when no complete line is available; such
    frames are skipped, as are lines with a field that holds no digits.
    Field 3 is read as the temperature, field 1 as the AC reading (divided
    by 1000), and fields 2 and 4 are multiplied to give the DC reading
    (also divided by 1000).

    The three buffers are trimmed IN PLACE to ten entries.  Re-binding the
    parameter names only shortened a local copy while the shared lists
    kept growing.
    """
    values = getValues()
    if values == 0:
        return

    raw_temp = _first_digits(values[3])
    raw_ac = _first_digits(values[1])
    raw_amps = _first_digits(values[2])
    raw_volts = _first_digits(values[4][:5])
    if None in (raw_temp, raw_ac, raw_amps, raw_volts):
        # Malformed line: skip this frame instead of crashing the callback.
        return

    # Only digits are extracted, so the temperature can never be negative.
    temp_c = Decimal(raw_temp)

    wAC = round(Decimal(raw_ac)/1000, 2)
    if wAC < 0.35: wAC = 0

    wDC = float(raw_amps) * float(raw_volts)
    wDC = round(abs(Decimal(wDC))/1000, 2)

    # Add the new sample, then keep only the ten newest per buffer.
    ysTemp.append(temp_c)
    ysAC.append(wAC)
    ysDC.append(wDC)
    for buffer in (ysTemp, ysAC, ysDC):
        del buffer[:-10]

    if len(ysTemp) == 10:
        # Remove the previous lines one by one.  Assigning ``ax.lines = []``
        # is rejected by newer matplotlib releases.
        for axes in (axRT2, axRT1):
            for old_line in list(axes.lines):
                old_line.remove()
        axRT2.plot(xs, ysTemp, 'r', label='Temp', linewidth = 4)
        axRT1.plot(xs, ysAC, 'b:', label='Mains', linewidth = 4)
        axRT1.plot(xs, ysDC, 'g--', label='Solar', linewidth = 4)


def _first_digits(text):
    """Return the first run of decimal digits in ``text``, or None."""
    found = re.search(r'\d+', text)
    return found.group() if found else None
def getValues():
    """Return the fields of the next five-field serial line, else 0.

    When data is waiting, one line is read, decoded, stripped and printed.
    It is split on commas only if it holds exactly four commas.
    """
    if ser.in_waiting > 0:
        text = ser.readline().decode('utf-8').rstrip()
        print(text)
        if text.count(',') == 4:
            return text.split(",")
    # Nothing waiting, or an incomplete line.
    return 0
# Set up plot to call animate() function periodically.
# fargs hands the label list and the three sample buffers to animate()
# after the frame argument; interval is 1000.
ani = animation.FuncAnimation(fig, animate, fargs=(xs, ysTemp, ysAC, ysDC), interval=1000, blit=False)
# Toggle full-screen mode on the current figure's window.
plt.get_current_fig_manager().full_screen_toggle()
# Interactive mode off before showing the figure.
plt.ioff()
plt.show()
# NOTE(review): this line is only reached after plt.show() returns --
# presumably once the window is closed; confirm it is still needed.
plt.draw()
I've got this code that is supposed to make a heatmap, but with circles instead of squares/rectangles, so far testing it with placeholder colors, looks like this:
import matplotlib.pyplot as plt
import matplotlib.colors as mcl
import numpy as np
import pandas as pd
from typing import List, T
from random import uniform
def l_flatten(l: List[T]) -> List[T]:
    """Concatenate the items of every element of *l* into one flat list."""
    flat = []
    for inner in l:
        flat.extend(inner)
    return flat
def get_luminance(color: str) -> float:
    """Return the weighted luminance of *color* (taken from Seaborn's utils).

    The colour is converted to RGB, each channel is linearised, and the
    channels are combined with fixed weights.  A single-colour result is
    unwrapped to a plain scalar; otherwise the array is returned as is.
    """
    channels = mcl.colorConverter.to_rgba_array(color)[:, :3]
    is_low = channels <= .03928
    linear = np.where(is_low,
                      channels / 12.92,
                      ((channels + .055) / 1.055) ** 2.4)
    weights = [.2126, .7152, .0722]
    luminance = linear.dot(weights)
    try:
        return luminance.item()
    except ValueError:
        # More than one colour: hand back the array unchanged.
        return luminance
class CircleHeatmap:
    """Heatmap-like grid drawn with one scatter marker per cell of ``df``."""

    def __init__(self,
                 ax: plt.Axes,
                 df: pd.DataFrame,
                 colors: List[str],
                 annot_show: bool,
                 annot_size: float,
                 circle_size: float,
                 x_labels: List[str],
                 x_labels_size: float,
                 x_labels_color: str,
                 y_labels: List[str],
                 y_labels_size: float,
                 y_labels_color: str) -> None:
        """Store the settings and precompute marker and tick positions."""
        # user-provided settings
        self.ax, self.df, self.colors = ax, df, colors
        self.annot_show, self.annot_size = annot_show, annot_size
        self.circle_size = circle_size
        self.x_labels, self.y_labels = x_labels, y_labels
        self.x_labels_size, self.y_labels_size = x_labels_size, y_labels_size
        self.x_labels_color, self.y_labels_color = x_labels_color, y_labels_color

        # derived values
        self.y_size, self.x_size = self.df.shape

        # One marker per cell, placed at the cell centre (index + 0.5).
        grid_x, grid_y = np.meshgrid(np.arange(self.x_size),
                                     np.arange(self.y_size))
        self.x_arr = (grid_x + 0.5).flat
        self.y_arr = (grid_y + 0.5).flat

        # Tick positions: one per label, also at index + 0.5.
        n_x, n_y = len(self.x_labels), len(self.y_labels)
        self.x_len = np.linspace(0, n_x, n_x + 1)[:-1] + 0.5
        self.y_len = np.linspace(0, n_y, n_y + 1)[:-1] + 0.5

        # All cell values, flattened row after row.
        self.df_values = l_flatten(self.df.values.tolist())

    def plot(self) -> None:
        """Draw the markers with area ``circle_size ** 2`` and ``colors``."""
        marker_area = self.circle_size ** 2
        self.ax.scatter(self.x_arr, self.y_arr, s = marker_area, c = self.colors)

    def labels(self) -> None:
        """Place the ticks and apply label text, font size and colour."""
        self.ax.set_xticks(self.x_len)
        self.ax.set_yticks(self.y_len)
        x_style = dict(fontsize = self.x_labels_size, color = self.x_labels_color)
        y_style = dict(fontsize = self.y_labels_size, color = self.y_labels_color)
        self.ax.set_xticklabels(self.x_labels, **x_style)
        self.ax.set_yticklabels(self.y_labels, **y_style)
def main() -> None:
    """Draw a 30x20 demo grid with placeholder colours into ``test2.png``."""
    n_rows, n_cols = 30, 20

    fig, ax = plt.subplots(figsize = (20, 30))
    random_cells = [[uniform(0, 1) for _ in range(n_cols)]
                    for _ in range(n_rows)]
    df = pd.DataFrame(random_cells)

    # Three placeholder colours repeated to give 600 entries.
    colors = ["#EC4E20", "#FF9505", "#016FB9"] * 200

    settings = dict(ax = ax,
                    df = df,
                    colors = colors,
                    annot_show = False,
                    annot_size = 16,
                    circle_size = 45,
                    x_labels = list(range(n_cols)),
                    x_labels_size = 20,
                    x_labels_color = "black",
                    y_labels = list(range(n_rows)),
                    y_labels_size = 20,
                    y_labels_color = "black")
    heatmap = CircleHeatmap(**settings)
    heatmap.plot()
    heatmap.labels()

    # Hide the frame around the plot.
    for side in ("top", "bottom", "right", "left"):
        ax.spines[side].set_visible(False)

    plt.savefig("test2.png")


if __name__ == "__main__":
    main()
As a result, I get something like this. My question is: how can I move the ticks and the labels on the x-axis up a little bit, preferably with an option to control the offset with a variable?
I got a similar result at first and only commented on the settings, since you are already familiar with them. A fix was still needed, so here is the corrected code. I'm not sure I added the new lines in the best place, but the following should help.
def labels(self) -> None:
    """Place the ticks, pin two spines at data 0, and style the labels."""
    self.ax.set_xticks(self.x_len)
    self.ax.set_yticks(self.y_len)

    # Anchor the bottom and left spines at data coordinate 0.
    origin = ('data', 0)
    self.ax.spines['bottom'].set_position(origin)
    self.ax.spines['left'].set_position(origin)

    x_style = dict(fontsize = self.x_labels_size, color = self.x_labels_color)
    y_style = dict(fontsize = self.y_labels_size, color = self.y_labels_color)
    self.ax.set_xticklabels(self.x_labels, **x_style)
    self.ax.set_yticklabels(self.y_labels, **y_style)
I am trying to plot a histogram but can't seem to get it working.
My current code is using a line plot.
The code is below:
ticker = 'BGSF'
style.use('ggplot')

start_date = '01-01-2010'
end_date = '03-07-2021'

# Pass the dates defined above; the names ``start`` and ``end`` used before
# are not defined anywhere in this script.
prices = pdr.DataReader(ticker, data_source='yahoo', start=start_date, end=end_date)['Close']
returns = prices.pct_change()
# Positional access to the final row, regardless of the index labels.
last_price = prices.iloc[-1]

number_of_simulations = 10000
num_days = 90

# The per-path loop used to stop after 251 further steps; keep that cap.
max_steps = 251
steps = min(num_days, max_steps)

# The volatility is the same for every path, so compute it once.
daily_vol = returns.std()

# Row 0 is last_price * (1 + shock); every later row multiplies the row
# above by (1 + shock).  A cumulative product gives all paths at once, so
# the frame is built in one go instead of column by column.
shocks = np.random.normal(0, daily_vol, size=(steps + 1, number_of_simulations))
simulation_df = pd.DataFrame(last_price * np.cumprod(1 + shocks, axis=0))

fig = plt.figure()
fig.suptitle('Simulator')
plt.plot(simulation_df)
plt.axhline(y = last_price, color = 'r', linestyle = '-')
plt.xlabel('Day')
plt.ylabel('Price')
plt.show()
How can I change my code so that I get a histogram/distribution? Or something that will give a discernible visual representation of the data.
The data output looks like:
0 1 2 ... 9997 9998 9999
0 13.628622 13.239073 12.377603 ... 11.604061 13.289695 12.351764
1 13.286069 13.229105 11.802037 ... 10.922634 13.369048 11.687561
2 13.278381 12.754887 11.293223 ... 10.722178 14.019657 11.468026
3 13.518970 13.051310 11.697287 ... 10.973414 14.125174 11.326635
4 12.696852 13.037619 11.126289 ... 10.840734 14.397820 11.226199
You can do the following
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#generate some data
col0 = np.random.randn(1000)
col1 = np.random.randn(1000)+10
df_simulated = pd.DataFrame({"0":col0,"1":col1})
# Reference value for the marker line; chosen between the two synthetic
# clusters so the line falls inside the plotted range.
last_price = 5

fig, ax = plt.subplots(1, 1)
fig.suptitle("Simulator")

# Draw on ``ax`` through its own methods: an ``axes=`` keyword does not
# direct a pyplot call to a particular Axes.
ax.hist(df_simulated.values, label=list(df_simulated.columns))
# The values are on the x axis of a histogram, so the marker is vertical.
ax.axvline(x=last_price, color='r', linestyle='-')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()
My timings show that k-means is consistently slower than a Gaussian mixture model that is itself initialized using k-means.
What's the explanation for this? Is the GMM using a different k-means algorithm? Am I misunderstanding how it works? Does it use a differently sized dataset (smaller than the one I'm drawing from)?
import sklearn.cluster
import sklearn.mixture
import numpy as np
import time
import matplotlib.pyplot as plt
# Number of clusters / mixture components requested from both models.
k = 3
# Number of timed repetitions of the benchmark loop.
N = 100
def clust():
    """Fit KMeans with ``k`` clusters to the global ``X``; return the centres.

    NOTE(review): KMeans is built with library defaults here; if those
    include several restarts while the mixture model's k-means
    initialisation runs only once, that would presumably explain the
    timing gap -- confirm against the scikit-learn documentation.
    """
    samples = X.reshape(-1, 1)
    model = sklearn.cluster.KMeans(n_clusters = k)
    model.fit(samples)
    return model.cluster_centers_
def fit():
    """Fit a ``k``-component Gaussian mixture to ``X``; return its means.

    The mixture is initialised with ``init_params = "kmeans"``.
    """
    samples = X.reshape(-1, 1)
    model = sklearn.mixture.GaussianMixture(n_components = k, init_params = "kmeans")
    model.fit(samples)
    return model.means_
# Per-run timings (milliseconds) and fitted centres for both models.
duration_clust = []
duration_fit = []
ctrs_clust = []
ctrs_fit = []

# Means of the three normal distributions the samples are drawn from.
true_centers = (0.25, 0.50, 0.75)

for i in range(N):
    # 50 samples around each true centre, stacked into one column vector.
    X = np.concatenate(
        [np.random.normal(center, 0.15, 50) for center in true_centers]
    ).reshape(-1, 1)

    # perf_counter() is the clock meant for measuring short intervals;
    # time.time() can be too coarse for millisecond-scale work.
    ts = time.perf_counter()
    c = clust()
    time_clust = (time.perf_counter() - ts) * 1e3

    ts = time.perf_counter()
    f = fit()
    time_fit = (time.perf_counter() - ts) * 1e3

    duration_clust.append(time_clust)
    duration_fit.append(time_fit)
    ctrs_clust.append(c)
    ctrs_fit.append(f)

bins0 = np.arange(0, 20, 1)
bins1 = np.linspace(0,1,30)

fig, ax = plt.subplots(nrows = 2)

# Top panel: distribution of run times.
ax[0].hist(duration_clust, label = "Kmeans", bins = bins0, alpha = 0.5)
ax[0].hist(duration_fit, label = "GMM with Kmeans", bins = bins0, alpha = 0.5)
ax[0].set_xlabel("duration (ms)")
ax[0].legend(loc = "upper right")

# Bottom panel: distribution of fitted centres against the true ones.
ax[1].hist(np.ravel(ctrs_clust), label = "Kmeans centers", bins = bins1, alpha = 0.5)
ax[1].hist(np.ravel(ctrs_fit), label = "GMM centers", bins = bins1, alpha = 0.5)
ax[1].set_xlabel("Center location")
# axvline takes a scalar position; only the first line carries the label.
ax[1].axvline(true_centers[0], label = "Truth", color = "black")
for center in true_centers[1:]:
    ax[1].axvline(center, color = "black")
ax[1].legend(loc = "upper right")

plt.tight_layout()
plt.show()