PYTHON: how can I add a linear regression to this bokeh? - python

how can I add a linar regression to this bokeh?, I have trouble with this, and dont know how to add to the figure the lr (don't know how to add to the curdoc expression). I've seen other posts, but havent found the way to add it to the bokeh. Please, help me with this showing how to add that line to the figure.
import pandas as pd
from bokeh.layouts import column, row
from bokeh.models import Select
from bokeh.palettes import Spectral5
from bokeh.plotting import curdoc, figure
from bokeh.sampledata.autompg import autompg_clean as df
df = df.copy()
SIZES = list(range(6, 22, 3))
COLORS = Spectral5
N_SIZES = len(SIZES)
N_COLORS = len(COLORS)
# data cleanup
df.cyl = df.cyl.astype(str)
df.yr = df.yr.astype(str)
del df['name']
columns = sorted(df.columns)
discrete = [x for x in columns if df[x].dtype == object]
continuous = [x for x in columns if x not in discrete]
def create_figure():
xs = df[x.value].values
ys = df[y.value].values
x_title = x.value.title()
y_title = y.value.title()
kw = dict()
if x.value in discrete:
kw['x_range'] = sorted(set(xs))
if y.value in discrete:
kw['y_range'] = sorted(set(ys))
kw['title'] = "%s vs %s" % (x_title, y_title)
p = figure(height=600, width=800, tools='pan,box_zoom,hover,reset', **kw)
p.xaxis.axis_label = x_title
p.yaxis.axis_label = y_title
if x.value in discrete:
p.xaxis.major_label_orientation = pd.np.pi / 4
sz = 9
if size.value != 'None':
if len(set(df[size.value])) > N_SIZES:
groups = pd.qcut(df[size.value].values, N_SIZES, duplicates='drop')
else:
groups = pd.Categorical(df[size.value])
sz = [SIZES[xx] for xx in groups.codes]
c = "#31AADE"
if color.value != 'None':
if len(set(df[color.value])) > N_COLORS:
groups = pd.qcut(df[color.value].values, N_COLORS, duplicates='drop')
else:
groups = pd.Categorical(df[color.value])
c = [COLORS[xx] for xx in groups.codes]
p.circle(x=xs, y=ys, color=c, size=sz, line_color="white", alpha=0.6, hover_color='white', hover_alpha=0.5)
return p
def update(attr, old, new):
layout.children[1] = create_figure()
x = Select(title='X-Axis', value='mpg', options=columns)
x.on_change('value', update)
y = Select(title='Y-Axis', value='hp', options=columns)
y.on_change('value', update)
size = Select(title='Size', value='None', options=['None'] + continuous)
size.on_change('value', update)
color = Select(title='Color', value='None', options=['None'] + continuous)
color.on_change('value', update)
controls = column(x, y, color, size, width=200)
layout = row(controls, create_figure())
curdoc().add_root(layout)
curdoc().title = "Crossfilter"

Related

Bokeh: update map based on dates filtered via DateSlider

This is my code:
import pandas as pd
import numpy as np
from bokeh.models import *
from bokeh.plotting import *
from bokeh.io import *
from bokeh.tile_providers import *
from bokeh.palettes import *
from bokeh.transform import *
from bokeh.layouts import *
radius_scale = 100
df = pd.DataFrame({'date': ['2009-01-01', '2009-01-02', '2009-01-03', '2009-01-04', '2009-01-05', '2009-01-01', '2009-01-02', '2009-01-03', '2009-01-04', '2009-01-05','2009-01-01', '2009-01-02', '2009-01-03', '2009-01-04', '2009-01-05'],
'state': ['Melaka', 'Melaka', 'Melaka', 'Melaka', 'Melaka', 'Perak', 'Perak', 'Perak', 'Perak', 'Perak', 'Kuala Lumpur', 'Kuala Lumpur', 'Kuala Lumpur', 'Kuala Lumpur', 'Kuala Lumpur'],
'tourists': [100, 121, 235, 197, 390, 57, 49, 81, 73, 183, 351, 490, 618, 438, 557]})
df['longitude'] = df['state'].map({'Melaka': 102.2464615, 'Perak': 101.0314453, 'Kuala Lumpur': 101.6869,})
df['latitude'] = df['state'].map({'Melaka': 2.206414407, 'Perak': 4.01185976, 'Kuala Lumpur': 3.1390,})
df['date'] = df['date'].astype('datetime64[ns]')
df['tourists_plot'] = df['tourists'] * radius_scale
# Mercator units conversion
def wgs84_to_web_mercator(df, lon, lat):
"""Converts decimal longitude/latitude to Web Mercator format"""
k = 6378137
df["x"] = df[lon] * (k * np.pi/180.0)
df["y"] = np.log(np.tan((90 + df[lat]) * np.pi/360.0)) * k
return df
df = wgs84_to_web_mercator(df, 'longitude', 'latitude')
source = ColumnDataSource(df)
# Map zoom scale
scale = 2500
x = df['x']
y = df['y']
# Centers map
x_min = int(x.mean() - (scale * 1))
x_max = int(x.mean() + (scale * 1))
y_min = int(y.mean() - (scale * 350))
y_max = int(y.mean() + (scale * 350))
# Prepare Bokeh
plot = figure(
title = 'Malaysia Tourism',
match_aspect = True,
tools = 'wheel_zoom, pan, reset, save',
x_range = (x_min, x_max), y_range = (y_min, y_max),
x_axis_type = 'mercator', y_axis_type = 'mercator',
width = 900
)
plot.grid.visible = True
# Get map
map = plot.add_tile(get_provider(OSM))
map.level = 'underlay'
plot.xaxis.visible = False
plot.yaxis.visible = False
plot.title.text_font_size = "20px"
def bubble_map(plot, df_source, radius_col_plot, radius_col, state, color='orange', leg_label='Bubble Map'):
source = df_source
c = plot.circle(x = 'x', y = 'y', color = color, source = source, size = 1, fill_alpha = 0.4, radius = radius_col_plot,\
legend_label = leg_label, hover_color = 'red')
tip_label = '#' + radius_col
state_label = '#' + state
circle_hover = HoverTool(tooltips = [("Percentage " + leg_label, tip_label + "%"), ('State', state_label)], mode = 'mouse',\
point_policy = 'follow_mouse', renderers = [c])
circle_hover.renderers.append(c)
plot.tools.append(circle_hover)
plot.legend.location = "top_right"
plot.legend.click_policy = "hide"
bubble_map(plot = plot,
df_source = source, radius_col_plot = 'tourists_plot', radius_col = 'tourists', leg_label = 'Tourists',
state = 'state', color = 'blue')
date_slider = DateSlider(start = min(df.date), end = max(df.date), value = min(df.date), step = 1, title = "Date")
def update_plot(attr, old, new):
datesel = new
new_data = df[df['date'] == datesel]
source.data.update(ColumnDataSource(new_data).data)
date_slider.on_change('value', update_plot)
# show(column(date_slider, plot))
curdoc().add_root(column(date_slider, plot))
In short, what I'm trying to plot is the growth of df['tourists'] of each state on map by each day. I am trying to enable data filtering via DateSlider widget date_slider which filters rows based on selected date
However the slider isn't working for me. I am not getting any errors either, in which I'm in lost of what to debug
This is what I got before selection, where all 5 days data are cluttered together:
Not like the slider helps either, all points disappeared upon sliding:
Please advise
DateSlider returns a timestamp in milliseconds, which you are using to filter the new dataframe. As the new date is not in the form of YYYY-MM-DD, your dataframe comes out empty.
Convert the timestamp received from DateSlider and it should work:
def update_plot(attr, old, new):
datesel = datetime.fromtimestamp(new / 1000).strftime('%Y-%m-%d')
new_data = df[df['date'] == datesel]
source.data.update(ColumnDataSource(new_data).data)
Since the timestamp is in milliseconds and datetime expects seconds, it has to be divided by 1000 to convert to seconds.
Another suggestion is also to change step from 1 to 86400000 to be able to slide select easier between dates (milliseconds in a day).
date_slider = DateSlider(start = min(df.date), end = max(df.date), value = min(df.date), step = 86400000, title = "Date")

Filling shapefile polygons partially in basemap

I would like to fill a polygon using basemap/shapefile data, but only a certain %. For example, in the example below, we fill based on the values, but let's say I wanted to fill a % of the polygon based on these values (code from here):
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
import numpy as np
fig= plt.figure()
ax= fig.add_subplot(111)
m=Basemap(projection='cyl',llcrnrlat=34.5,llcrnrlon=19,
urcrnrlat=42,urcrnrlon=28.5,resolution='h')
m.drawmapboundary(fill_color='aqua')
m.fillcontinents(color='w',lake_color='aqua')
m.drawcoastlines()
m.readshapefile('data/nomoi/nomoi','nomoi')
dict1={14464: 1.16, 14465: 1.35, 14466: 1.28, 14467: 1.69, 14468: 1.81, 14418: 1.38}
colvals = dict1.values()
cmap=plt.cm.RdYlBu
norm=plt.Normalize(min(colvals),max(colvals))
patches = []
for info, shape in zip(m.nomoi_info, m.nomoi):
if info['ID_2'] in list(dict1.keys()):
color=cmap(norm(dict1[info['ID_2']]))
patches.append( Polygon(np.array(shape), True, color=color) )
pc = PatchCollection(patches, match_original=True, edgecolor='k', linewidths=1., zorder=2)
ax.add_collection(pc)
#colorbar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array(colvals)
fig.colorbar(sm, ax=ax)
plt.show()
Thank you.
import math
from shapely.geometry import Polygon as shpoly
#shapefile of main massachusetts shape
iowpoly = state_shapes['Massachusetts'][32]
def return_xy(coords):
return [np.asarray([i[0] for i in coords]), np.asarray([i[1] for i in coords])]
def return_area(coords):
x, y = return_xy(coords)
return 0.5*np.abs(np.dot(x,np.roll(y,1))-np.dot(y,np.roll(x,1)))
def return_bounding_box(coords):
x, y = return_xy(coords)
return [[min(x), min(y)], [max(x), max(y)]]
def split_x_wise(bbox, weights, split = 2):
lleft = bbox[0]
uright = bbox[1]
dx = abs(uright[0] - lleft[0])
weights = np.cumsum(sorted(weights, reverse=True))
xcoords = [lleft[0]+weights[x-1]*dx for x in range(1, split)]
return xcoords
def generate_splits_by_area(coords, bbox, weights, tolerance = 0.03, div = 100):
xareasplits = {}
weights = np.cumsum(sorted(weights, reverse=True))[:-1]
lleft = bbox[0]
uright = bbox[1]
dx = abs(uright[0] - lleft[0])
xsplit = [lleft[0]+(dx/div)*x for x in range(1, div)]
for w in weights:
xareasplits[str(w)] = None
mainarea = shpoly(coords).area
for i, s in enumerate(xsplit):
poly = []
if i == 0:
continue
for ip, p in enumerate(coords):
if p[0] < s:
poly.append(p)
shpl = shpoly(poly).area
frac = shpl/mainarea
for w in weights:
if abs(w-frac) <= tolerance:
if xareasplits[str(w)] == None:
xareasplits[str(w)] = s
return list(xareasplits.values())
def return_split(coords, weights, split = 2, by_area = False, tolerance = 0.03, div = 100):
polys = {}
for x in range(0, split):
polys[str(x+1)] = {'points':[], 'maxit' : None}
bbox = return_bounding_box(coords)
if not by_area:
xsplit = split_x_wise(bbox, weights, split)
#test = generate_splits_by_area(coords, bbox, weights, tolerance=tolerance, div=div)
else:
xsplit = generate_splits_by_area(coords, bbox, weights, tolerance=tolerance, div=div)
xsplit.append(bbox[0][0])
xsplit.append(bbox[1][0])
xsplit = sorted(xsplit)
#print(xsplit)
#print(test)
for ip, p in enumerate(coords):
for i, splt in enumerate(xsplit):
if i > 0:
if (p[0] > xsplit[i-1]) & (p[0] < splt):
if len(polys[str(i)]['points']) == 0:
polys[str(i)]['points'].append(coords[ip-1])
polys[str(i)]['points'].append(p)
polys[str(i)]['maxit'] = ip
for poly, data in polys.items():
tmaxit = data['maxit']+1
if tmaxit >= len(coords):
data['points'].append(coords[0])
else:
data['points'].append(coords[tmaxit])
return polys
#return [p for p in coords if p[0] > xsplit[0]]
#bboxiowa = return_bounding_box(iowpoly)
splitpoly = return_split(iowpoly, weights = [0.2780539772727273, 0.1953716856060606, 0.19513494318181818, 0.18329782196969696, 0.14814157196969696],by_area = True,split = 5)
for k, v in splitpoly.items():
print (k, len(v['points']))
print (v['maxit'])
test = shpoly(splitpoly["1"]['points'])
test
I managed to write my own code to split and fill shapel polygons from shapefiles. The above code example splits the Massachusetts shapefile into 5 segments, weighted according to weights and by area.
The first 2 parts of the split look like this:

How to increase planes' size in Plotly

Got the following code
import pandas as pd
import plotly.graph_objects as go
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/tiago-peres/immersion/master/Platforms_dataset.csv')
fig = px.scatter_3d(df, x='Functionality ', y='Accessibility', z='Immersion', color='Platforms')
grey = [[0,'#C0C0C0'],[1,'#C0C0C0']]
zero_pt = pd.Series([0])
z = zero_pt.append(df['Immersion'], ignore_index = True).reset_index(drop = True)
y = zero_pt.append(df['Accessibility'], ignore_index = True).reset_index(drop = True)
x = zero_pt.append(df['Functionality '], ignore_index = True).reset_index(drop = True)
length_data = len(z)
z_plane_pos = 66.5*np.ones((length_data,length_data))
fig.add_trace(go.Surface(x=x, y=y, z=z_plane_pos, colorscale=grey, showscale=False))
fig.add_trace(go.Surface(x=x.apply(lambda x: 15.69), y=y, z = np.array([z]*length_data), colorscale= grey, showscale=False))
fig.add_trace(go.Surface(x=x, y= y.apply(lambda x: 55), z = np.array([z]*length_data).transpose(), colorscale=grey, showscale=False))
fig.update_layout(scene = dict(
xaxis = dict(nticks=4, range=[0,31.38],),
yaxis = dict(nticks=4, range=[0,110],),
zaxis = dict(nticks=4, range=[0,133],),),
legend_orientation="h",margin=dict(l=0, r=0, b=0, t=0))
that can be opened in Google Colab which produces the following output
As you can see, the planes are not filling up the entire axis space, they should respect the axis range. In other words, the planes
z=66.5 - should exist between [0, 31.38] in x and [0, 110] in y
x=15.59 - should exist between [0, 110] in y and [0, 133] in z
y=55 - should exist between [0, 31.38] in x and [0, 133] in z
How can that be done?
With this new adjustment,
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/tiago-peres/immersion/master/Platforms_dataset.csv')
fig = px.scatter_3d(df, x='Functionality ', y='Accessibility', z='Immersion', color='Platforms')
grey = [[0,'#C0C0C0'],[1,'#C0C0C0']]
zero_pt = pd.Series([0])
z1 = np.arange(0, 134, 1)
print(z1)
y1 = np.arange(0, 111, 1)
print(z1)
x1 = np.arange(0, 32.38, 1)
print(z1)
z = zero_pt.append(df['Immersion'], ignore_index = True).reset_index(drop = True)
y = zero_pt.append(df['Accessibility'], ignore_index = True).reset_index(drop = True)
x = zero_pt.append(df['Functionality '], ignore_index = True).reset_index(drop = True)
print(zero_pt)
print(z)
test1 = pd.Series([133])
test = z.append(test1)
length_data1 = len(z1)
z_plane_pos = 66.5*np.ones((length_data1,length_data1))
length_data2 = len(y1)
y_plane_pos = 55*np.ones((length_data2,length_data2))
length_data3 = len(x1)
x_plane_pos = 15.69*np.ones((length_data3,length_data3))
fig.add_trace(go.Surface(x=x1, y=y1, z=z_plane_pos, colorscale=grey, showscale=False))
fig.add_trace(go.Surface(x=x.apply(lambda x: 15.69), y=y1, z = np.array([test]*length_data1), colorscale= grey, showscale=False))
fig.add_trace(go.Surface(x=x1, y= y.apply(lambda x: 55), z = np.array([test]*length_data1).transpose(), colorscale=grey, showscale=False))
fig.update_layout(scene = dict(
xaxis = dict(nticks=4, range=[0,31.38],),
yaxis = dict(nticks=4, range=[0,110],),
zaxis = dict(nticks=4, range=[0,133],),),
legend_orientation="h",margin=dict(l=0, r=0, b=0, t=0))
nearly having the job done but the planes x=15.59 and y=55 aren't going to the maximum 133 in Immersion
The issue was that the arrays I was plotting weren't the right shapes.
By properly splitting the bit where created the input arrays and the plotting, was able to discover this, and consequently made input arrays (for plotting) of the right size and appropriate content.
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/tiago-peres/immersion/master/Platforms_dataset.csv')
zero_pt = pd.Series([0])
z1 = np.arange(0, 134, 1)
y1 = np.arange(0, 111, 1)
x1 = np.arange(0, 32.38, 1)
z = zero_pt.append(df['Immersion'], ignore_index = True).reset_index(drop = True)
y = zero_pt.append(df['Accessibility'], ignore_index = True).reset_index(drop = True)
x = zero_pt.append(df['Functionality '], ignore_index = True).reset_index(drop = True)
test1 = pd.Series([133])
test = z.append(test1)
length_data1 = len(z1)
z_plane_pos = 66.5*np.ones((length_data1,length_data1))
length_data2 = len(y1)
y_plane_pos = 55*np.ones((length_data2,length_data2))
length_data3 = len(x1)
x_plane_pos = 15.69*np.ones((length_data3,length_data3))
xvals = x.apply(lambda x: 15.69)
xvals2 = x1
yvals = y1
yvals2 = y.apply(lambda x: 55)
zvals = np.zeros((len(yvals), len(xvals)))
zvals[:, -1] = 133 # np.array([test]*length_data2)
zvals2 = np.zeros((len(yvals2), len(xvals2)))
zvals2[-1, :] = 133
fig = px.scatter_3d(df, x='Functionality ', y='Accessibility', z='Immersion', color='Platforms')
grey = [[0,'#C0C0C0'],[1,'#C0C0C0']]
fig.add_trace(go.Surface(x=x1, y=y1, z=z_plane_pos, colorscale=grey, showscale=False))
fig.add_trace(go.Surface(x=xvals, y=yvals, z = zvals, colorscale= grey, showscale=False))
fig.add_trace(go.Surface(x=xvals2, y=yvals2, z = zvals2, colorscale=grey, showscale=False))
fig.update_layout(scene = dict(
xaxis = dict(nticks=4, range=[0,31.38],),
yaxis = dict(nticks=4, range=[0,110],),
zaxis = dict(nticks=4, range=[0,133],),),
legend_orientation="h",margin=dict(l=0, r=0, b=0, t=0))
See it here.

Splittig data in python dataframe and getting the array values automatically

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv('D:\ history/segment.csv')
data = pd.DataFrame(data)
data = data.sort_values(['Prob_score'], ascending=[False])
one = len(data)
actualpaid_overall = len(data.loc[data['paidstatus'] == 1])
data_split = np.array_split(data, 10)
data1 = data_split[0]
actualpaid_ten = len(data1.loc[data1['paidstatus'] == 1])
percent_ten = actualpaid_ten/actualpaid_overall
data2 = data_split[1]
actualpaid_twenty = len(data2.loc[data2['paidstatus'] == 1])
percent_twenty = (actualpaid_twenty/actualpaid_overall) + percent_ten
data3 = data_split[2]
actualpaid_thirty = len(data3.loc[data3['paidstatus'] == 1])
percent_thirty = (actualpaid_thirty/actualpaid_overall) + percent_twenty
data4 = data_split[3]
actualpaid_forty = len(data4.loc[data4['paidstatus'] == 1])
percent_forty = (actualpaid_forty/actualpaid_overall) + percent_thirty
data5 = data_split[4]
actualpaid_fifty = len(data5.loc[data5['paidstatus'] == 1])
percent_fifty = (actualpaid_fifty/actualpaid_overall) + percent_forty
data6 = data_split[5]
actualpaid_sixty = len(data6.loc[data6['paidstatus'] == 1])
percent_sixty = (actualpaid_sixty/actualpaid_overall) + percent_fifty
data7 = data_split[6]
actualpaid_seventy = len(data7.loc[data7['paidstatus'] == 1])
percent_seventy = (actualpaid_seventy/actualpaid_overall) + percent_sixty
data8 = data_split[7]
actualpaid_eighty = len(data8.loc[data8['paidstatus'] == 1])
percent_eighty = (actualpaid_eighty/actualpaid_overall) + percent_seventy
data9 = data_split[8]
actualpaid_ninenty = len(data9.loc[data9['paidstatus'] == 1])
percent_ninenty = (actualpaid_ninenty/actualpaid_overall) + percent_eighty
data10 = data_split[9]
actualpaid_hundred = len(data10.loc[data10['paidstatus'] == 1])
percent_hundred = (actualpaid_hundred/actualpaid_overall) + percent_ninenty
array_x = [10,20,30,40,50,60,70,80,90,100]
array_y = [ percent_ten, percent_twenty, percent_thirty, percent_forty,percent_fifty, percent_sixty, percent_seventy, percent_eighty, percent_ninenty, percent_hundred]
plt.xlabel(' Base')
plt.ylabel(' percent')
ax = plt.plot(array_x,array_y)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5, color='0.1')
plt.grid( which='both', axis = 'both', linewidth=0.5,color='0.75')
The above is my python code i have splitted my dataframe into 10 equal sections and plotted the graph but I'm not satisfied with this i have two concerns:
array_x = [10,20,30,40,50,60,70,80,90,100] in this line of code i have manually taken the x values, is there any possible way to process automatically as i have taken split(data,10) it should show 10 array values
As we can see the whole data1,2,3,4...10 is being repeated again and again is there a solution to write this in a function or loop.
Any help with codes will be appreciated. Thanks
I believe you need list comprehension and for count is possible use simplier way - sum of boolean mask, True values are processes like 1, then convert list to numpy array and use numpy.cumsum:
data = pd.read_csv('D:\ history/segment.csv')
data = data.sort_values('Prob_score', ascending=False)
one = len(data)
actualpaid_overall = (data['paidstatus'] == 1).sum()
data_split = np.array_split(data, 10)
x = [len(x) for x in data_split]
y = [(x['paidstatus'] == 1).sum()/actualpaid_overall for x in data_split]
array_x = np.cumsum(np.array(x))
array_y = np.cumsum(np.array(y))
plt.xlabel(' Base')
plt.ylabel(' percent')
ax = plt.plot(array_x,array_y)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5, color='0.1')
plt.grid( which='both', axis = 'both', linewidth=0.5,color='0.75')
Sample:
np.random.seed(2019)
N = 1000
data = pd.DataFrame({'paidstatus':np.random.randint(3, size=N),
'Prob_score':np.random.randint(100, size=N)})
#print (data)
data = data.sort_values(['Prob_score'], ascending=[False])
actualpaid_overall = (data['paidstatus'] == 1).sum()
data_split = np.array_split(data, 10)
x = [len(x) for x in data_split]
y = [(x['paidstatus'] == 1).sum()/actualpaid_overall for x in data_split]
array_x = np.cumsum(np.array(x))
array_y = np.cumsum(np.array(y))
print (array_x)
[ 100 200 300 400 500 600 700 800 900 1000]
print (array_y)
[0.09118541 0.18844985 0.27963526 0.38601824 0.49848024 0.61702128
0.72036474 0.81155015 0.9331307 1. ]

Very slow plot with Matlpotlib

Can anybody help how to optimize the plot function in python? I use Matplotlib to plot financial data.Here small function for plotting OHLC data. The time increase significantly if I add indicators or other data.
import numpy as np
import datetime
from matplotlib.collections import LineCollection
from pylab import *
import urllib2
def test_plot(OHLCV):
bar_width = 1.3
date_offset = 0.5
fig = figure(figsize=(50, 20), facecolor='w')
ax = fig.add_subplot(1, 1, 1)
labels = ax.get_xmajorticklabels()
setp(labels, rotation=0)
month = MonthLocator()
day = DayLocator()
timeFmt = DateFormatter('%Y-%m-%d')
colormap = OHLCV[:,1] < OHLCV[:,4]
color = np.zeros(colormap.__len__(), dtype = np.dtype('|S5'))
color[:] = 'red'
color[np.where(colormap)] = 'green'
dates = date2num( OHLCV[:,0])
lines_hl = LineCollection( zip(zip(dates, OHLCV[:,2]), zip(dates, OHLCV[:,3])))
lines_hl.set_color(color)
lines_hl.set_linewidth(bar_width)
lines_op = LineCollection( zip(zip((np.array(dates) - date_offset).tolist(), OHLCV[:,1]), zip((np.array(dates)).tolist(), parsed_table[:,1])))
lines_op.set_color(color)
lines_op.set_linewidth(bar_width)
lines_cl = LineCollection( zip(zip((np.array(dates) + date_offset).tolist(), OHLCV[:,4]), zip((np.array(dates)).tolist(), parsed_table[:,4])))
lines_cl.set_color(color)
lines_cl.set_linewidth(bar_width)
ax.add_collection(lines_hl, autolim=True)
ax.add_collection(lines_cl, autolim=True)
ax.add_collection(lines_op, autolim=True)
ax.xaxis.set_major_locator(month)
ax.xaxis.set_major_formatter(timeFmt)
ax.xaxis.set_minor_locator(day)
ax.autoscale_view()
ax.xaxis.grid(True, 'major')
ax.grid(True)
ax.set_title('EOD test plot')
ax.set_xlabel('Date')
ax.set_ylabel('Price , $')
fig.savefig('test.png', dpi = 50, bbox_inches='tight')
close()
if __name__=='__main__':
data_table = urllib2.urlopen(r"http://ichart.finance.yahoo.com/table.csv?s=IBM&a=00&b=1&c=2012&d=00&e=15&f=2013&g=d&ignore=.csv").readlines()[1:][::-1]
parsed_table = []
#Format: Date, Open, High, Low, Close, Volume
dtype = (lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').date(),float, float, float, float, int)
for row in data_table:
field = row.strip().split(',')[:-1]
data_tmp = [i(j) for i,j in zip(dtype, field)]
parsed_table.append(data_tmp)
parsed_table = np.array(parsed_table)
import time
bf = time.time()
count = 100
for i in xrange(count):
test_plot(parsed_table)
print('Plot time: %s' %(time.time() - bf) / count)
The result is something like this. Average time execution on each plot is aproximately 2.6s. Charting in R is much faster, but I didn't measure the performance and I don't want use Rpy, so I bielive that my code is inefficient.
This solution reuses a Figure instance and saves plots asynchronously. You could change this to have as many figures as there are processors, do that many plots asynchronously, and it should speed things up even more. As it is, this takes ~1s per plot, down from 2.6 on my machine.
import numpy as np
import datetime
import urllib2
import time
import multiprocessing as mp
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pylab import *
from matplotlib.collections import LineCollection
class AsyncPlotter():
def __init__(self, processes=mp.cpu_count()):
self.manager = mp.Manager()
self.nc = self.manager.Value('i', 0)
self.pids = []
self.processes = processes
def async_plotter(self, nc, fig, filename, processes):
while nc.value >= processes:
time.sleep(0.1)
nc.value += 1
print "Plotting " + filename
fig.savefig(filename)
plt.close(fig)
nc.value -= 1
def save(self, fig, filename):
p = mp.Process(target=self.async_plotter,
args=(self.nc, fig, filename, self.processes))
p.start()
self.pids.append(p)
def join(self):
for p in self.pids:
p.join()
class FinanceChart():
def __init__(self, async_plotter):
self.async_plotter = async_plotter
self.bar_width = 1.3
self.date_offset = 0.5
self.fig = plt.figure(figsize=(50, 20), facecolor='w')
self.ax = self.fig.add_subplot(1, 1, 1)
self.labels = self.ax.get_xmajorticklabels()
setp(self.labels, rotation=0)
line_hl = LineCollection(([[(734881,1), (734882,5), (734883,9), (734889,5)]]))
line_op = LineCollection(([[(734881,1), (734882,5), (734883,9), (734889,5)]]))
line_cl = LineCollection(([[(734881,1), (734882,5), (734883,9), (734889,5)]]))
self.lines_hl = self.ax.add_collection(line_hl, autolim=True)
self.lines_op = self.ax.add_collection(line_cl, autolim=True)
self.lines_cl = self.ax.add_collection(line_op, autolim=True)
self.ax.set_title('EOD test plot')
self.ax.set_xlabel('Date')
self.ax.set_ylabel('Price , $')
month = MonthLocator()
day = DayLocator()
timeFmt = DateFormatter('%Y-%m-%d')
self.ax.xaxis.set_major_locator(month)
self.ax.xaxis.set_major_formatter(timeFmt)
self.ax.xaxis.set_minor_locator(day)
def test_plot(self, OHLCV, i):
colormap = OHLCV[:,1] < OHLCV[:,4]
color = np.zeros(colormap.__len__(), dtype = np.dtype('|S5'))
color[:] = 'red'
color[np.where(colormap)] = 'green'
dates = date2num( OHLCV[:,0])
date_array = np.array(dates)
xmin = min(dates)
xmax = max(dates)
ymin = min(OHLCV[:,1])
ymax = max(OHLCV[:,1])
self.lines_hl.set_segments( zip(zip(dates, OHLCV[:,2]), zip(dates, OHLCV[:,3])))
self.lines_hl.set_color(color)
self.lines_hl.set_linewidth(self.bar_width)
self.lines_op.set_segments( zip(zip((date_array - self.date_offset).tolist(), OHLCV[:,1]), zip(date_array.tolist(), OHLCV[:,1])))
self.lines_op.set_color(color)
self.lines_op.set_linewidth(self.bar_width)
self.lines_cl.set_segments( zip(zip((date_array + self.date_offset).tolist(), OHLCV[:,4]), zip(date_array.tolist(), OHLCV[:,4])))
self.lines_cl.set_color(color)
self.lines_cl.set_linewidth(self.bar_width)
self.ax.set_xlim(xmin,xmax)
self.ax.set_ylim(ymin,ymax)
self.ax.xaxis.grid(True, 'major')
self.ax.grid(True)
self.async_plotter.save(self.fig, '%04i.png'%i)
if __name__=='__main__':
print "Starting"
data_table = urllib2.urlopen(r"http://ichart.finance.yahoo.com/table.csv?s=IBM&a=00&b=1&c=2012&d=00&e=15&f=2013&g=d&ignore=.csv").readlines()[1:][::-1]
parsed_table = []
#Format: Date, Open, High, Low, Close, Volume
dtype = (lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').date(),float, float, float, float, int)
for row in data_table:
field = row.strip().split(',')[:-1]
data_tmp = [i(j) for i,j in zip(dtype, field)]
parsed_table.append(data_tmp)
parsed_table = np.array(parsed_table)
import time
bf = time.time()
count = 10
a = AsyncPlotter()
_chart = FinanceChart(a)
print "Done with startup tasks"
for i in xrange(count):
_chart.test_plot(parsed_table, i)
a.join()
print('Plot time: %.2f' %(float(time.time() - bf) / float(count)))

Categories

Resources