I'm trying to generate a faceted boxplot for linear model results, with treatments on the x axis. A conventional way to show significance is to append asterisks.
I'm finding this surprisingly difficult to do in plotly.
Example code:
import numpy as np
import pandas as pd
# Simulated data: three treatments observed in every (column, row) facet,
# with n replicate observations per treatment and facet.
n = 10
conditiona = ['left', 'right']
conditionb = ['top', 'middle', 'bottom']
N = n * len(conditiona) * len(conditionb)  # observations per treatment

# Per-observation treatment label, true effect and significance marker.
trt = np.repeat(['a', 'b', 'c'], N)
eff = np.repeat([3, 2, 1], N)
pval = np.repeat(['**', '', ''], N)
noise = np.random.normal(loc=0, scale=1, size=3 * N)

# Facet membership: the column condition changes every n * len(conditionb)
# observations, the row condition every n observations.
col = np.tile(np.repeat(conditiona, n * len(conditionb)), 3)
row = np.tile(np.repeat(conditionb, n), 3 * len(conditiona))

df = pd.DataFrame({
    'y': noise + eff,
    'trt': trt,
    'p': pval,
    'column': col,
    'row': row,
})
## Plot
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Facet layout: one subplot per (row, column) combination, in order of
# first appearance in the data.
rows = df['row'].unique().tolist()
cols = df['column'].unique().tolist()
groups = df['trt'].unique().tolist()
labs = [col_name + ' ' + row_name for row_name in rows for col_name in cols]
colors = ['red', 'green', 'blue']

fig = make_subplots(
    rows=len(rows),
    cols=len(cols),
    shared_xaxes=True,
    subplot_titles=labs,
)

# One box trace per treatment inside every facet.
for (row_name, col_name, trt_name), dx in df.groupby(['row', 'column', 'trt']):
    box = go.Box(
        y=dx['y'],
        boxpoints='all',
        name=str(trt_name),
        marker_color=colors[groups.index(trt_name)],
        text=dx['p'],
    )
    # Open question from the post: how to obtain the x position of `box`
    # so that a marker/text can be overlaid at the median.
    fig.add_trace(
        box,
        row=rows.index(row_name) + 1,  # subplot indices are 1-based
        col=cols.index(col_name) + 1,
    )

fig.show()
[Desired] Output:
Is there an easy way to 'extract' the x coordinates of a box trace so I can overlay a marker?
Seems like this shouldn't be hard.
Figured it out eventually. You just have to know how plotly sets things up beforehand, apparently.
You can use annotations with xref and yref referencing the subplots. The pattern of assignment is confusing (to me) and poorly documented.
y_refs increase sequentially from the bottom left, reading left to right. Thus in this figure bottom left panel is 'y1', bottom right is 'y2', middle left is 'y3' , middle right is 'y4' and so on.
ncol = len(cols)
fig = make_subplots(
    rows=len(rows),
    cols=ncol,
    shared_xaxes=True,
    subplot_titles=labs,
)

for (row_name, col_name, trt_name), dx in df.groupby(['row', 'column', 'trt']):
    r = rows.index(row_name) + 1  # subplot indices are 1-based
    c = cols.index(col_name) + 1
    name = str(trt_name)

    box = go.Box(
        y=dx['y'],
        boxpoints='all',
        name=name,
        marker_color=colors[groups.index(trt_name)],
        text=dx['p'],
    )
    fig.add_trace(box, row=r, col=c)

    # Anchor the significance label to this facet's axes: the x reference
    # is taken from the column, the y reference is numbered across facets.
    fig.add_annotation(
        x=name,
        y=dx['y'].median(),
        text=dx['p'].unique()[0],
        ax=0,
        ay=0,
        showarrow=False,
        xref='x' + str(c),
        yref='y' + str((r - 1) * ncol + c),
        font=dict(size=24),
    )

fig.show()
Here is a 2D example of what I want to achieve in 3D:
I have an array of values, A, s.t. A.shape=(n,m), e.g.
>>> A = [[1, 2],
... [3, 4]]
whose indexes are proportional to equally spaced steps along (arbitrary) basis vectors, e.g.
>>> v1 = [1,0]
>>> v2 = [cos(pi/4),sin(pi/4)] # [0,1] rotated 45 degrees
I want a function which applies this basis to get, for this example
>>> apply_basis2D(A,v1,v2)
[[np.nan,1, 2],
[3, 4, np.nan]]
(so for the 3D version then, I'd want apply_basis3D(A,v1,v2,v3)), where A.shape=(n,m,l))
I have a notion that this can be done by affine transformations, but am not really sure how. This is as close an implementation as I could find (2D-only), using scikit-image;
Thanks in advance!
Done! Seems to work quite well, but I welcome critique:
import numpy as np
from scipy.spatial import Delaunay
from scipy.interpolate import interpn
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
def cartesian_basis2d(A,v1,v2,longest_side=None):
    """Resample a 2d array given in the basis v1,v2 onto a cartesian grid.

    Parameters
    ----------
    A : array((N,M))
        values in original basis
    v1 : array((2,))
    v2 : array((2,))
    longest_side : int
        number of samples along the longer side of the new array
        (defaults to the longest side of A)

    Returns
    -------
    B : array((P,Q))
        nearest-neighbour resampling of A; cells outside the parallelogram
        spanned by v1 and v2 are NaN

    """
    if longest_side is None:
        longest_side = max(A.shape)

    # the parallelogram is assumed to start at the origin
    origin = np.asarray([0, 0])
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)

    # matrix turning cartesian coordinates into (v1, v2) coefficients
    to_basis = np.linalg.inv(np.transpose([v1, v2]))

    # the lookup works on the transposed array (the result is transposed back
    # on return); every edge is repeated once so that points up to half a
    # cell outside the data still have a nearest neighbour
    padded = np.pad(np.array(A).T, 1, mode='edge')

    # coefficient axes: cell centres of the original data cover [0, 1]
    grid_axes = []
    for n_samples in padded.shape:
        step = 1. / (n_samples - 2)
        grid_axes.append(np.linspace(0, 1 + step, n_samples) - step / 2.)

    # corners of the parallelogram and its cartesian extents
    corners = np.asarray([origin, v1, v1 + v2, v2])
    hull = Delaunay(corners)
    lower = corners.min(axis=0)
    upper = corners.max(axis=0)
    min_bound = lower.min()
    max_bound = upper.max()
    x_length, y_length = np.abs(lower - upper)

    # the longer side gets longest_side samples, the other is scaled to match
    if x_length > y_length:
        xlen = longest_side
        ylen = int(longest_side * y_length / float(x_length))
    else:
        ylen = longest_side
        xlen = int(longest_side * x_length / float(y_length))

    # index and cartesian coordinate of every cell of the new array
    new_array = np.full((xlen, ylen), np.nan)
    index_grids = np.meshgrid(range(xlen), range(ylen))
    xyidx = np.array([grid.flatten() for grid in index_grids]).T
    xy = min_bound + (xyidx.astype(float) * abs(min_bound - max_bound) / longest_side)

    # only cells inside the parallelogram receive a value
    inside_mask = hull.find_simplex(xy) >= 0
    filled = xyidx[inside_mask]
    uv = np.einsum('...jk,...k->...j', to_basis, xy[inside_mask])
    new_array[filled[:, 0], filled[:, 1]] = interpn(
        grid_axes, padded, uv, bounds_error=True, method='nearest')

    return new_array.T
# Demo: a 3x3 array whose second basis vector is tilted by 45 degrees.
A = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
v1 = [2, 0]
v2 = [np.cos(np.pi/4), np.sin(np.pi/4)]

new_array = cartesian_basis2d(A, v1, v2, 100)
plt.imshow(new_array, origin='lower');
def cartesian_basis3d(A,v1,v2,v3,longest_side=None):
    """Resample a 3d array given in the basis v1,v2,v3 onto a cartesian grid.

    Parameters
    ----------
    A : array((N,M,L))
        values in original basis
    v1 : array((3,))
    v2 : array((3,))
    v3 : array((3,))
    longest_side : int
        number of samples along the longest side of the new array
        (defaults to the longest side of A)

    Returns
    -------
    B : array((P,Q,R))
        nearest-neighbour resampling of A; cells outside the parallelepiped
        spanned by v1, v2 and v3 are NaN

    """
    longest_side = max(A.shape) if longest_side is None else longest_side

    # the parallelepiped is assumed to start at the origin
    origin = np.asarray([0, 0, 0])
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    v3 = np.asarray(v3)

    # matrix turning cartesian coordinates into (v1, v2, v3) coefficients
    M_inv = np.linalg.inv(np.transpose([v1, v2, v3]))

    # the lookup works on the transposed array (the result is transposed back
    # on return); every face is repeated once so that points up to half a
    # cell outside the data still have a nearest neighbour
    A = np.pad(np.array(A).T, 1, mode='edge')

    # coefficient axes: cell centres of the original data cover [0, 1]
    axes = []
    for n_samples in A.shape:
        step = 1. / (n_samples - 2)
        axes.append(np.linspace(0, 1 + step, n_samples) - step / 2.)

    # corners of the parallelepiped and its cartesian extents
    bbox_pts = np.asarray([origin, v1, v2, v3,
                           v1 + v2, v1 + v3, v1 + v2 + v3, v2 + v3])
    hull = Delaunay(bbox_pts)
    lower = bbox_pts.min(axis=0)
    upper = bbox_pts.max(axis=0)
    # The upper bound must use the z *maximum*; using the z minimum here
    # made the sampled region too small whenever z was the largest extent.
    min_bound, max_bound = lower.min(), upper.max()

    # the longest side gets longest_side samples, the others are scaled
    extents = np.abs(upper - lower)
    longest = extents.max()
    xlen, ylen, zlen = [
        longest_side if ext == longest
        else int(longest_side * ext / float(longest))
        for ext in extents
    ]

    # index and cartesian coordinate of every cell of the new array
    new_array = np.full((xlen, ylen, zlen), np.nan)
    index_grids = np.meshgrid(range(xlen), range(ylen), range(zlen))
    xyzidx = np.array([grid.flatten() for grid in index_grids]).T
    xyz = min_bound + (xyzidx.astype(float) * abs(min_bound - max_bound) / longest_side)

    # only cells inside the parallelepiped receive a value
    inside_mask = hull.find_simplex(xyz) >= 0
    filled = xyzidx[inside_mask]
    uvw = np.einsum('...jk,...k->...j', M_inv, xyz[inside_mask])
    new_array[filled[:, 0], filled[:, 1], filled[:, 2]] = interpn(
        axes, A, uvw, bounds_error=True, method='nearest')

    return new_array.T
# Demo: a 2x2x2 array in a sheared basis, drawn as a 3d scatter plot.
A = np.array(
    [[[1, 1], [2, 2]],
     [[3, 3], [4, 4]]])
v1 = [2, 0, 0]
v2 = [np.cos(np.pi/4), np.sin(np.pi/4), 0]
v3 = [0, np.cos(np.pi/4), np.sin(np.pi/4)]
new_array = cartesian_basis3d(A, v1, v2, v3, 100)

# Select the filled voxels explicitly: NaN counts as non-zero, so
# new_array.nonzero() would return the empty (NaN) cells as well and would
# skip genuine zero values.
xs, ys, zs = np.nonzero(~np.isnan(new_array))

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
pcm = ax.scatter(xs, ys, zs, c=new_array[xs, ys, zs], cmap='jet')
plt.show()
I have a file with gps coordinates and a scalar value measured from a
bus following some route. I would like to produce a plot with distance
travelled on the x-axis and the scalar value plotted along the
y-axis. I would like to label the x-axis with kilometers and also
with labels indicating the busstops I am interested in.
To illustrate the problem here is some code to make a MWE for a
similar problem where a bus is travelling in a triangular route with
corners A, B and C at xy-coordinates [0,0], [0,1], [1,0]. The bus is
travelling in a loop A-->B-->C-->A.. etc. The scalar value is x+y
i.e. the sum of the coordinate positions.
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
n = 30   # readings per leg of the triangle
L = 1.0  # length scale of a leg


def generate_route(num_loops):
    """Generate x,y coordinates and the scalar x+y for a bus driving A->B->C.

    The loop is repeated ``num_loops`` times with ``n`` readings per leg, and
    every coordinate gets a small random positive offset.

    Returns
    -------
    list
        ``[x, y, val]``: three lists of equal length, where ``val[i]`` is
        ``x[i] + y[i]``.

    """
    def get_perturb():
        return float(np.random.rand(1)[0] * 0.5*(L/n))

    x = []
    y = []
    for _ in range(num_loops):
        # A to B
        x += [0.0] * n
        y += [i*L/float(n) for i in range(0, n)]
        # B to C
        x += [i*L/float(n) for i in range(0, n)]
        y += [1.0 - i*L/float(n) for i in range(0, n)]
        # C to A
        x += [1.0 - i*L/float(n) for i in range(0, n)]
        y += [0.0] * n

    # Build real lists: on Python 3 ``map`` is lazy, so the returned x and y
    # would be one-shot iterators shared with ``val`` and have no length.
    x = [xi + get_perturb() for xi in x]
    y = [yi + get_perturb() for yi in y]
    val = [xi + yi for xi, yi in zip(x, y)]
    return [x, y, val]
x, y, val = generate_route(3)

# put data into DataFrame, one reading per second starting from now
d = {'x': x, 'y': y, 'val': val}
start = dt.datetime.today().replace(microsecond=0)
df = pd.DataFrame(d, index=pd.date_range(start, periods=len(x), freq='1s'))

# plot route
plt.figure()
df.plot(x='x', y='y', xlim=[-0.1, 1.1], ylim=[-0.1, 1.1])
ax = plt.gca()
ax.set_title('Route: x vs y')

# plot bus stops
stops = {'A': [0., 0.], 'B': [0., 1.], 'C': [1., 0.]}
for stop_name, marker in (('A', 'r.'), ('B', 'g.'), ('C', 'y.')):
    ax.plot(stops[stop_name][0], stops[stop_name][1], marker, markersize=20)
# plt.savefig('route.png')

# compute distance travelled as sum of line segments connecting adjacent
# readings; the first reading has no predecessor and is dropped
previous = df[['x', 'y']].rename(
    columns={'x': 'x_prev', 'y': 'y_prev'}, copy=True).shift(1)
df = pd.concat([df, previous], axis=1).dropna()
step_x = df['x'] - df['x_prev']
step_y = df['y'] - df['y_prev']
df['Dist'] = np.sqrt(step_x**2 + step_y**2)
df['TotalDist'] = df['Dist'].cumsum(0)

# plot value with distance
plt.figure()
df.plot(x='TotalDist', y='val')
ax = plt.gca()
ax.set_title('TotalDist vs val')
# plt.savefig('totaldistvsval.png')
plt.show()
Output figures:
Now I get stuck, I would like to add bus-stop labels along the x-axis
on the TotalDist vs val plot e.g. labelled vertical lines or
similar. I have the following code to label each row of the DataFrame
with the stop it is close to. One problem is that many rows will
match each so I need to pick just one in each set and then I need to use that
to add labels to the x-axis.
def label_stops(row):
    """Return the name of the stop that ``row`` is close to, else 'None'.

    ``row`` must provide 'x' and 'y'. A stop counts as close when it lies
    within ``3.0*(L/n)`` of the reading. If several stops qualify, the one
    visited last while iterating over ``stops`` is returned.
    """
    def close(p, q):
        return np.sqrt((p[0]-q[0])**2 + (p[1]-q[1])**2) < 3.0*(L/n)

    position = [row['x'], row['y']]
    res = 'None'
    # .items() exists on both Python 2 and 3; .iteritems() is Python-2 only
    for name, loc in stops.items():
        if close(position, loc):
            res = name
    return res


df['label'] = df.apply(label_stops, axis=1)
df
Which gives the following which is some progress:
val x y x_prev y_prev Dist TotalDist label
2014-09-07 14:57:17 0.046516 0.008194 0.038322 0.014114 0.001992 0.036809 0.036809 A
2014-09-07 14:57:18 0.084732 0.014400 0.070333 0.008194 0.038322 0.032607 0.069416 A
2014-09-07 14:57:19 0.122984 0.013296 0.109688 0.014400 0.070333 0.039370 0.108786 None
2014-09-07 14:57:20 0.154545 0.005306 0.149240 0.013296 0.109688 0.040351 0.149137 None
... ... ... ... ... ... ... ... ...
2014-09-07 14:57:42 0.882114 0.007021 0.875094 0.009029 0.839339 0.035811 0.888190 None
2014-09-07 14:57:43 0.923723 0.015505 0.908218 0.007021 0.875094 0.034194 0.922383 B
2014-09-07 14:57:44 0.952783 0.014462 0.938320 0.015505 0.908218 0.030121 0.952504 B
2014-09-07 14:57:45 0.985179 0.009943 0.975237 0.014462 0.938320 0.037192 0.989696 B
2014-09-07 14:57:46 1.010307 0.007226 1.003080 0.009943 0.975237 0.027976 1.017672 B
... ... ... ... ... ... ... ... ...
2014-09-07 15:01:16 1.011478 1.001009 0.010469 0.970733 0.042690 0.044214 9.412063 C
2014-09-07 15:01:17 0.968017 0.967922 0.000095 1.001009 0.010469 0.034676 9.446738 C
2014-09-07 15:01:19 0.921621 0.907178 0.014444 0.934321 0.008302 0.027829 9.509157 C
2014-09-07 15:01:20 0.876492 0.875172 0.001320 0.907178 0.014444 0.034592 9.543749 None
2014-09-07 15:01:21 0.862456 0.846593 0.015863 0.875172 0.001320 0.032066 9.575814 None
... ... ... ... ... ... ... ... ...
I came up with the following which works fine but is probably not idiomatic.
# Keep only the readings that are near a stop and split them into runs of
# consecutive rows sharing the same label. `groups` is a list of such runs,
# each run being a list of rows.
#
# A run starts wherever the label differs from the label of the previous
# kept row. Numbering the runs this way keeps the first row of every run
# and also stores the final run, and it copes with an empty selection.
df1 = df[df['label'] != 'None']
run_id = (df1['label'] != df1['label'].shift()).cumsum()
groups = [
    [row for _, row in run.iterrows()]
    for _, run in df1.groupby(run_id)
]

# extract stop locations: the middle reading of each run marks the stop
# (integer division, so the index is valid on Python 3 as well)
loclines = []
for g in groups:
    mids = g[len(g) // 2]
    loclines.append([mids['TotalDist'], mids['label']])
# mark stops on plot as coloured vertical lines
plt.figure()
df.plot(x='TotalDist', y='val')
ax = plt.gca()
ax.set_title('TotalDist vs val')

# same colours as the stop markers on the route plot
stop_colors = {'A': 'r', 'B': 'g', 'C': 'y'}
for distance, stop_name in loclines:
    plt.axvline(x=distance, color=stop_colors[stop_name])
plt.show()
Resulting figure:
I am having a small difficulty with Numpy indexing. The script gives only the index of the last array three times when it supposed to give index of three different arrays (F_fit in the script). I am sure it is a simple thing, but I haven't figured it out yet. The 3_phases.txt file contains these 3 lines
1 -1 -1 -1 1 1
1 1 1 -1 1 1
1 1 -1 -1 -1 1
Here is the code:
import numpy as np
import matplotlib.pyplot as plt
# Inputs for the continuous curve F_cont sampled on the grid x.
D = 12.96
n = np.arange(1,7)
F0 = 1.0
x = np.linspace(0.001,4,2000)
Q = 2*np.pi*np.array([1/D, 2/D, 3/D, 4/D, 5/D, 6/D])
I = (11.159, 43.857, 26.302, 2.047, 0.513, 0.998)
# Per the accompanying text, the file holds three lines of six +/-1 values.
phase = np.genfromtxt('3_phases.txt')
# First pass: build and plot one curve per row of `phase`.
# NOTE: F_cont is rebound on every iteration, so once this loop has finished
# only the curve computed for the last row is still available.
for row in phase:
    F = (np.sqrt(np.square(n)*I/sum(I)))*row
    d = sum(i*(np.sin(x*D/2+np.pi*j)/(x*D/2+np.pi*j))for i,j in zip(F,n))
    e = sum(i*(np.sin(x*D/2-np.pi*j)/(x*D/2-np.pi*j))for i,j in zip(F,n))
    f_0 = F0*(np.sin(x*D/2)/(x*D/2))
    F_cont = np.array(d) + np.array(e) + np.array(f_0)
    plt.plot(x,F_cont,'r')
#plt.show()
plt.clf()
# Second data set (the "2" names).
D2 = 12.3
I2 = (9.4, 38.6, 8.4, 3.25, 0, 0.37)
Q2 = 2*np.pi*np.array([1/D2, 2/D2, 3/D2, 4/D2, 5/D2, 6/D2])
n2 = np.arange(1,7)
# Second pass: compare each row's F2 with samples taken from F_cont.
# NOTE: F_cont here is the single array left over from the first loop, and
# I_data is computed from Q2 only, so F_fit is identical on every iteration.
for row in phase:
    F2 = (np.sqrt(np.square(n2)*I2/sum(I2)))*row
    plt.plot(Q2,F2,'o')
    #plt.show()
    F_data = F2
    Q_data = Q2
    # Turn the Q2 values into integer positions used to index F_cont.
    I_data = np.around(2000*Q2/(4-0.001))
    I_data = np.array(map(int,I_data))
    F_fit = F_cont[I_data]
    print F_fit
    # R2 is 1 - (squared residuals / squared deviations of F_data from its mean).
    R2 = (1-(sum(np.square(F_data-F_fit))/sum(np.square(F_data-np.mean(F_data)))))
Any help would be appreciated.
You are redefining F_cont each time you go through your first loop. By the time you get to your second loop (with all the _2 values) you only have access to the F_cont from the last row.
To fix this, move your _2 definitions above your first loop and only do the loop once, then you'll have access to each F_cont and your printouts will be different.
The following code is identical to yours except for the rearrangement described above, as well as the fact that I implemented my comment from above (using n/D in your Q's).
import numpy as np
import matplotlib.pyplot as plt
# Inputs for the continuous curve F_cont sampled on the grid x.
D = 12.96
n = np.arange(1, 7)
F0 = 1.0
x = np.linspace(0.001, 4, 2000)
Q = 2*np.pi*n/D
I = (11.159, 43.857, 26.302, 2.047, 0.513, 0.998)
phase = np.genfromtxt('3_phases.txt')

# Second data set, defined before the loop so that a single pass over
# `phase` can use the matching F_cont of every row.
D2 = 12.3
I2 = (9.4, 38.6, 8.4, 3.25, 0, 0.37)
Q2 = 2*np.pi*n/D2
n2 = np.arange(1, 7)

# Integer positions used to sample F_cont at the Q2 values. They depend on
# Q2 only, so they are computed once. astype(int) replaces
# np.array(map(int, ...)), which on Python 3 wraps a lazy map object instead
# of producing an index array.
I_data = np.around(2000*Q2/(4-0.001)).astype(int)

for row in phase:
    # continuous curve for this row
    F = (np.sqrt(np.square(n)*I/sum(I)))*row
    d = sum(i*(np.sin(x*D/2+np.pi*j)/(x*D/2+np.pi*j)) for i, j in zip(F, n))
    e = sum(i*(np.sin(x*D/2-np.pi*j)/(x*D/2-np.pi*j)) for i, j in zip(F, n))
    f_0 = F0*(np.sin(x*D/2)/(x*D/2))
    F_cont = np.array(d) + np.array(e) + np.array(f_0)
    plt.plot(x, F_cont, 'r')
    plt.clf()

    # data points for the same row
    F2 = (np.sqrt(np.square(n2)*I2/sum(I2)))*row
    plt.plot(Q2, F2, 'o')

    F_data = F2
    Q_data = Q2
    F_fit = F_cont[I_data]
    # print() with one argument behaves the same on Python 2 and 3
    print(F_fit)
    # R2 is 1 - (squared residuals / squared deviations of F_data from its mean).
    R2 = (1-(sum(np.square(F_data-F_fit))/sum(np.square(F_data-np.mean(F_data)))))
F_fit is being calculated from I_data, which is in turn being calculated from Q2. Q2 is set outside the loop, and doesn't depend on row - perhaps you meant I_data to be a function of F2 instead?