Reorder Sankey diagram vertically based on label value - python

I'm trying to plot patient flows between 3 clusters in a Sankey diagram. I have a pd.DataFrame counts with from-to values, see below. To reproduce this DF, here is the counts dict that should be loaded into a pd.DataFrame (which is the input for the visualize_cluster_flow_counts function).
from to value
0 C1_1 C1_2 867
1 C1_1 C2_2 405
2 C1_1 C0_2 2
3 C2_1 C1_2 46
4 C2_1 C2_2 458
... ... ... ...
175 C0_20 C0_21 130
176 C0_20 C2_21 1
177 C2_20 C1_21 12
178 C2_20 C0_21 0
179 C2_20 C2_21 96
The from and to values in the DataFrame encode the cluster number (0, 1, or 2) and the day number used for the x-axis (between 1 and 21). If I plot the Sankey diagram with these values, this is the result:
Code:
import plotly.graph_objects as go
def visualize_cluster_flow_counts(counts):
    """Show a Sankey diagram of the flows listed in *counts*.

    Args:
        counts: DataFrame with a ``'from'`` and a ``'to'`` column holding node
            labels. The third column is read positionally as the link value
            and, when present, the fourth column as the link label.
    """
    sources = counts['from'].values.tolist()
    targets = counts['to'].values.tolist()

    # First-seen order keeps node indices identical from run to run; iterating
    # a set of strings does not.
    all_sources = list(dict.fromkeys(sources + targets))
    node_index = {name: position for position, name in enumerate(all_sources)}

    froms = [node_index[name] for name in sources]
    tos = [node_index[name] for name in targets]
    # .iloc is explicitly positional; plain row[2] on a label-indexed Series
    # relies on a deprecated fallback.
    vals = counts.iloc[:, 2].tolist()

    link = dict(source=froms, target=tos, value=vals)
    if counts.shape[1] > 3:
        # Link labels are optional: only attach them when a fourth column exists.
        link['label'] = counts.iloc[:, 3].tolist()

    fig = go.Figure(data=[go.Sankey(
        arrangement='snap',
        node=dict(
            pad=15,
            thickness=5,
            line=dict(color="black", width=0.1),
            label=all_sources,
            color="blue",
        ),
        link=link,
    )])
    fig.update_layout(title_text="Patient flow between clusters over time: 48h (2 days) - 504h (21 days)", font_size=10)
    fig.show()
visualize_cluster_flow_counts(counts)
However, I would like to vertically order the bars so that the C0's are always on top, the C1's are always in the middle, and the C2's are always at the bottom (or the other way around, doesn't matter). I know that we can set node.x and node.y to manually assign the coordinates. So, I set the x-values to the amount of days * (1/range of days), which is an increment of +- 0.045. And I set the y-values based on the cluster value: either 0, 0.5 or 1. I then obtain the image below. The vertical order is good, but the vertical margins between the bars are obviously way off; they should be similar to the first result.
The code to produce this is:
import plotly.graph_objects as go
def find_node_coordinates(sources):
    """Turn node labels such as ``"C1_7"`` into Sankey x/y positions.

    The number after the last underscore is scaled by 1/21 to give x. The
    second character of the label selects y: "0" -> 1, "1" -> 0.5, anything
    else -> 1e-09.

    Returns:
        A tuple ``(x_nodes, y_nodes)`` of two lists in the order of *sources*.
    """
    y_by_cluster = {"0": 1, "1": 0.5}
    # Compute both coordinates per label in one pass so a malformed label
    # fails at the same point it always did.
    coords = [
        (float(label.split("_")[-1]) * (1/21), y_by_cluster.get(label[1], 1e-09))
        for label in sources
    ]
    x_nodes = [x for x, _ in coords]
    y_nodes = [y for _, y in coords]
    return x_nodes, y_nodes
def visualize_cluster_flow_counts(counts):
    """Show a Sankey diagram of *counts* with explicitly positioned nodes.

    Node positions come from ``find_node_coordinates``.

    Args:
        counts: DataFrame with a ``'from'`` and a ``'to'`` column holding node
            labels. The third column is read positionally as the link value
            and, when present, the fourth column as the link label.
    """
    sources = counts['from'].values.tolist()
    targets = counts['to'].values.tolist()

    # First-seen order keeps node indices identical from run to run; iterating
    # a set of strings does not.
    all_sources = list(dict.fromkeys(sources + targets))
    node_index = {name: position for position, name in enumerate(all_sources)}
    node_x, node_y = find_node_coordinates(all_sources)

    froms = [node_index[name] for name in sources]
    tos = [node_index[name] for name in targets]
    # .iloc is explicitly positional; plain row[2] on a label-indexed Series
    # relies on a deprecated fallback.
    vals = counts.iloc[:, 2].tolist()

    link = dict(source=froms, target=tos, value=vals)
    if counts.shape[1] > 3:
        # Link labels are optional: only attach them when a fourth column exists.
        link['label'] = counts.iloc[:, 3].tolist()

    fig = go.Figure(data=[go.Sankey(
        arrangement='snap',
        node=dict(
            pad=15,
            thickness=5,
            line=dict(color="black", width=0.1),
            label=all_sources,
            color="blue",
            x=node_x,
            y=node_y,
        ),
        link=link,
    )])
    fig.update_layout(title_text="Patient flow between clusters over time: 48h (2 days) - 504h (21 days)", font_size=10)
    fig.show()
visualize_cluster_flow_counts(counts)
Question: how do I fix the margins of the bars, so that the result looks like the first result? So, for clarity: the bars should be pushed to the bottom. Or is there another way that the Sankey diagram can vertically re-order the bars automatically based on the label value?

Firstly, I don't think the currently exposed API offers a way to achieve your goal smoothly; you can check the source code here.
Try changing your find_node_coordinates function as follows (note that you should pass the counts DataFrame to it as well):
counts = pd.DataFrame(counts_dict)
def find_node_coordinates(sources, counts, *, flat_on_top=False):
    """Compute Sankey node centres so clusters C0/C1/C2 stack with fixed gaps.

    Each node's height is its flux (the larger of its total incoming and total
    outgoing value), scaled so that the total flow leaving the day-1 nodes
    fills 85% of the unit y range. The remaining 15% is split into two equal
    margins between the three clusters.

    Args:
        sources: Node labels of the form ``"C<cluster>_<day>"``.
        counts: DataFrame with ``'from'``, ``'to'`` and ``'value'`` columns.
        flat_on_top: Selects the "flat on top" formulas instead of the default
            "flat on bottom" ones.

    Returns:
        A tuple ``(x_nodes, y_nodes)`` of two lists in the order of *sources*.

    Raises:
        ZeroDivisionError: If no flow leaves the day-1 nodes.
        ValueError: In the default layout, if a label's cluster digit is not
            "0", "1" or "2".
    """
    total_margin_width = 0.15
    y_range = 1 - total_margin_width
    margin = total_margin_width / 2  # two gaps between three clusters

    # One pass over the links instead of rescanning them for every node.
    out_flux, in_flux = {}, {}
    links = zip(counts['from'].values.tolist(),
                counts['to'].values.tolist(),
                counts['value'].values.tolist())
    for src, dst, value in links:
        out_flux[src] = out_flux.get(src, 0) + value
        in_flux[dst] = in_flux.get(dst, 0) + value

    # Total flow leaving the three day-1 nodes sets the scale of the y axis.
    max_acc = sum(out_flux.get('C{}_{}'.format(cluster, 1), 0)
                  for cluster in (0, 1, 2))
    graph_unit_per_val = y_range / max_acc

    def _node_height(cluster, day):
        """Scaled flux of node C<cluster>_<day>; 0 if the node has no links."""
        node = 'C{}_{}'.format(cluster, day)
        flux = max(in_flux.get(node, 0), out_flux.get(node, 0))
        return flux * graph_unit_per_val

    x_nodes, y_nodes = [], []
    for s in sources:
        # Shift each x with +- 0.045
        d = int(s.split("_")[-1])
        x_nodes.append(float(d) * (1/21))

        cluster_number = s[1]
        h0, h1, h2 = (_node_height(cluster, d) for cluster in (0, 1, 2))
        if flat_on_top:
            if cluster_number == "0":
                y = h2 + margin + h1 + margin + h0 / 2
            elif cluster_number == "1":
                y = h2 + margin + h1 / 2
            else:
                y = 1e-09
        else:
            if cluster_number == "0":
                y = 1 - (h0 / 2)
            elif cluster_number == "1":
                y = 1 - (h0 + margin + h1 / 2)
            elif cluster_number == "2":
                y = 1 - (h0 + margin + h1 + margin + h2 / 2)
            else:
                # Previously this silently reused the y of the preceding node.
                raise ValueError("unknown cluster in node label: {!r}".format(s))
        y_nodes.append(y)
    return x_nodes, y_nodes
Sankey diagrams are supposed to weight each link's width by its normalized value, right? I do the same here: first the flux of each node is calculated, and then the center of each node is computed in normalized coordinates according to that flux.
Here is the sample output of your code with the modified function. Note that I tried to adhere to your code as much as possible, so it's a bit unoptimized (for example, one could store the flux of the nodes above each source node to avoid recalculating it).
With flag flat_on_top = True
With flag flat_on_top = False
There is a bit of inconsistency in the flat_on_bottom version which I think is caused by the padding or other internal sources of Plotly API.

Related

How to highlight the lowest line on a linegraph in matplotlib?

Currently, my code generates a line graph based on an array of x,y values generated from a function called f(), like so:
T = 0
for i in range(0,10):
#function f generates array of values based on T to plot x,y
x,y = f(T)
plt.plot(x, y, label = "T={}".format(T))
T += 1
This generates a graph like so:
Is there a streamlined way to make all of the lines grey, while highlighting the line with the lowest endpoint (its last point along the x-axis) in red and the line with the highest endpoint in green, regardless of what y is in between?
So for this example, where T=5 the line would be red and where T=3 the line would be green, and for the other lines all the same shade of grey.
Simply store all your x and y values in two lists :
X = [x0,..., x9] # List of lists.
Y = [y0,..., y9] # Same. x0, y0 = f(0)
Then find the highest and lowest value :
# The endpoint of a curve is its last y value.
endpoints = [y[-1] for y in Y]
highest_endpoint = max(endpoints)
lowest_endpoint = min(endpoints)
# index() returns the first occurrence, so ties resolve to the earliest curve.
highest_endpoint_indice = endpoints.index(highest_endpoint)
lowest_endpoint_indice = endpoints.index(lowest_endpoint)
# Plot the curves.
for T in range(10) :
if T == highest_endpoint_indice :
plt.plot(X[T], Y[T], label = "T={}".format(T), color = 'green')
elif T == lowest_endpoint_indice :
plt.plot(X[T], Y[T], label = "T={}".format(T), color = 'red')
else :
plt.plot(X[T], Y[T], label = "T={}".format(T), color = 'gray')
plt.show()

How to plot 3 or more values in plot.bar()

I tried to make plot.bar() using 2 values having them in a list, but I'm unable to plot 3 values.
I tried to add plot.bar(x,y,z), but it didn't work.
ce_data = ce_data.drop(
['pchangeinOpenInterest', 'totalTradedVolume', 'impliedVolatility', # this removes unecesssary items
'pChange', 'totalBuyQuantity', 'totalSellQuantity', 'bidQty',
'bidprice', 'askQty', 'askPrice', 'askQty', 'identifier', 'lastPrice', 'change', 'expiryDate',
'underlying'], axis=1)[
['openInterest', 'changeinOpenInterest', 'strikePrice', 'underlyingValue']]
style.use('ggplot')
ce_data.to_csv('kumar.csv')
df = pd.read_csv('kumar.csv', parse_dates=True, index_col=0)
pivot = df.iloc[2, 3] # this selects the strike price
pivot_round = round(pivot, -2) # round of the price
x = df['strikePrice'].tolist()
y = df['changeinOpenInterest'].tolist()
z = df['openInterest'].tolist()
for i in range(len(x)):
if int(x[i]) >= pivot_round - 400:
xleftpos = i
break
for i in range(len(x)):
if int(x[i]) >= pivot_round + 400:
xrightpos = i
break
x = x[xleftpos:xrightpos]
y = y[xleftpos:xrightpos]
z = z[xleftpos:xrightpos]
plot.bar([value for value in range(len(x))],y)
plot.set_xticks([idx + 0.5 for idx in range(len(x))])
plot.set_xticklabels(x, rotation=35, ha='right', size=10)
I am expecting strike price in x axis and y and z (change in oi and oi) in as bars.
IIUC, here's how I'd do it. This should have a single x-axis w/ 'strikePrice' and two bars of 'changeinOpenInterest' and 'openInterest'.
# Use the strike price as the x-axis index and keep only the two series to
# draw; DataFrame.plot then renders one bar per column for every strike.
# (pivot() would instead spread every distinct 'changeinOpenInterest' value
# into its own column, and current pandas rejects positional arguments to it.)
disp_df = df.set_index('strikePrice')[['changeinOpenInterest', 'openInterest']]
disp_df.plot(kind='bar')
You can add the bells and whistles you want to the plot, but this avoids a lot of the manipulation you did above.

How to plot k-means clustering results in an ordered way

I am using k-means clustering as a means of customer and product segmentation. I found a function on Stack Overflow that takes the cluster results and reorders them based on the average value of a target column in the dataframe. This seems to be working quite well, but in order to plot the results I first create a string column in the data frame based on the ordered clustering, to prevent seaborn from creating bins in the hue labels. The first problem I ran into was that, while the plot and labels were being generated as intended, the legend was out of order. I added a hue order, but the legend becomes fixed to this order, so changing the value of K makes the legend confusing. I added a function to address this problem as well, and everything seems to be working as intended, but I would like to know if there are any better ways of achieving this. I will place the related code blocks below.
#function for ordering cluster numbers
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Renumber cluster ids by the mean of a target column.

    Clusters are ranked 0, 1, 2, ... by the mean of *target_field_name* within
    each cluster, in the direction given by *ascending*. The returned frame has
    the original cluster column removed and a column of the same name, holding
    the rank, appended as the last column.

    Args:
        cluster_field_name: Column holding the current cluster ids.
        target_field_name: Column whose per-cluster mean decides the order.
        df: Input DataFrame; it is not modified.
        ascending: Passed to ``sort_values``; True gives rank 0 to the cluster
            with the smallest mean.

    Returns:
        A new DataFrame with the cluster column replaced by its rank.
    """
    cluster_means = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    cluster_means = cluster_means.sort_values(
        by=target_field_name, ascending=ascending).reset_index(drop=True)

    # Pick a helper column name that cannot clash with the caller's columns.
    # A fixed 'index' name gets suffixed by merge when df already has an
    # 'index' column, and the rename below then silently does nothing.
    rank_col = '__cluster_rank__'
    while rank_col in df.columns:
        rank_col += '_'
    cluster_means[rank_col] = cluster_means.index

    ranked = pd.merge(df, cluster_means[[cluster_field_name, rank_col]],
                      on=cluster_field_name)
    ranked = ranked.drop([cluster_field_name], axis=1)
    return ranked.rename(columns={rank_col: cluster_field_name})
#adding column to dataframe based on clustering
kmeans = KMeans(n_clusters=4)
kmeans.fit(data[['ORDERS_PLACED','UNITS_SOLD','AVG_UNIT_PRICE','TOTAL_SALES']])
data['Rank'] = kmeans.predict(data[['ORDERS_PLACED','UNITS_SOLD','AVG_UNIT_PRICE','TOTAL_SALES']])
#ordering the results
data = order_cluster('Rank','TOTAL_SALES',data,True)
top = data['Rank'].max()
#adding string column to dataframe
# Rank `top` maps to Group_A, `top - 1` to Group_B, ... `top - 9` to Group_J;
# every other rank falls back to Group_Z.
_group_names = ['Group_A', 'Group_B', 'Group_C', 'Group_D', 'Group_E',
                'Group_F', 'Group_G', 'Group_H', 'Group_I', 'Group_J']
_rank_to_group = {top - offset: name for offset, name in enumerate(_group_names)}
data['Rank_ID'] = [_rank_to_group.get(x, 'Group_Z') for x in data['Rank']]
#function to build the plot legend values
def build_legend(k_value):
    """Return the legend labels for a plot whose highest rank is *k_value*.

    For ``k_value`` equal to 0..9 the first ``k_value + 1`` labels
    ('Group_A' onwards) are returned; for any other value all eleven labels,
    including the trailing 'Group_Z', are returned.
    """
    all_groups = ['Group_A', 'Group_B', 'Group_C', 'Group_D', 'Group_E', 'Group_F',
                  'Group_G', 'Group_H', 'Group_I', 'Group_J', 'Group_Z']
    for top_rank in range(10):
        if k_value == top_rank:
            return all_groups[:top_rank + 1]
    return all_groups
#plotting the results
orderHue = build_legend(top)
fig, ax = plt.subplots(figsize=(12,5))
# NOTE(review): `report` is not defined in the visible code; presumably it is
# the clustered frame that carries 'Rank_ID' - confirm against the caller.
plot = sns.scatterplot(x='ORDERS_PLACED', y='TOTAL_SALES', hue='Rank_ID', size='Rank_ID',
                       hue_order=orderHue, size_order=orderHue, data=report, ax=ax)
# Re-label the y ticks with thousands separators.
ytick = plot.get_yticks()
plot.set_yticklabels(['{:,.0f}'.format(x) for x in ytick])
plot.set_title('80/20 Customer Segmentation Using K-Means Clustering, Plot on Orders Placed & Total Sales',fontsize=12)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
# pyplot.show() draws every open figure and only accepts the keyword `block`;
# passing the Axes positionally is rejected by current Matplotlib.
plt.show()
This seems like a lot of code to achieve what might be quite simple.
Here is a quick sample of the data as requested,
CUSTOMER_ID ORDERS_PLACED UNITS_SOLD AVG_UNIT_PRICE TOTAL_SALES
A 2 59 21553.9 1271680
B 106 184 6295.9 1158445.7
C 13 78 14290 1114620
D 43 2034 245.38 499102
E 53 582 760.92 442856
F 1 6 15000 90000
G 3 60 967 58020
H 1 1 1807 1807

Stacked bar graph with variable width elements?

In Tableau I'm used to making graphs like the one below. It has for each day (or some other discrete variable), a stacked bar of categories of different colours, heights and widths.
You can imagine the categories to be different advertisements that I show to people. The heights correspond to the percentage of people I've shown the advertisement to, and the widths correspond to the rate of acceptance.
It allows me to see very easily which advertisements I should probably show more often (short, but wide bars, like the 'C' category on September 13th and 14th) and which I should show less often (tall, narrow bars, like the 'H' category on September 16th).
Any ideas on how I could create a graph like this in R or Python?
Unfortunately, this is not so trivial to achieve with ggplot2 (I think), because geom_bar does not really support changing widths for the same x position. But with a bit of effort, we can achieve the same result:
Create some fake data
set.seed(1234)
d <- as.data.frame(expand.grid(adv = LETTERS[1:7], day = 1:5))
d$height <- runif(7*5, 1, 3)
d$width <- runif(7*5, 0.1, 0.3)
My data doesn't add up to 100%, cause I'm lazy.
head(d, 10)
# adv day height width
# 1 A 1 1.227407 0.2519341
# 2 B 1 2.244599 0.1402496
# 3 C 1 2.218549 0.1517620
# 4 D 1 2.246759 0.2984301
# 5 E 1 2.721831 0.2614705
# 6 F 1 2.280621 0.2106667
# 7 G 1 1.018992 0.2292812
# 8 A 2 1.465101 0.1623649
# 9 B 2 2.332168 0.2243638
# 10 C 2 2.028502 0.1659540
Make a new variable for stacking
We can't easily use position_stack I think, so we'll just do that part ourselves. Basically, we need to calculate the cumulative height for every bar, grouped by day. Using dplyr we can do that very easily.
library(dplyr)
d2 <- d %>% group_by(day) %>% mutate(cum_height = cumsum(height))
Make the plot
Finally, we create the plot. Note that the x and y refer to the middle of the tiles.
library(ggplot2)
ggplot(d2, aes(x = day, y = cum_height - 0.5 * height, fill = adv)) +
geom_tile(aes(width = width, height = height), show.legend = FALSE) +
geom_text(aes(label = adv)) +
scale_fill_brewer(type = 'qual', palette = 2) +
labs(title = "Views and other stuff", y = "% of views")
If you don't want to play around with correctly scaling the widths (to something < 1), you can use facets instead:
ggplot(d2, aes(x = 1, y = cum_height - 0.5 * height, fill = adv)) +
geom_tile(aes(width = width, height = height), show.legend = FALSE) +
geom_text(aes(label = adv)) +
facet_grid(~day) +
scale_fill_brewer(type = 'qual', palette = 2) +
labs(title = "Views and other stuff", y = "% of views", x = "")
Result
set.seed(1)
days <- 5
cats <- 8
dat <- prop.table(matrix(rpois(days * cats, days), cats), 2)
bp1 <- barplot(dat, col = seq(cats))
## some width for rect
rate <- matrix(runif(days * cats, .1, .5), cats)
## calculate xbottom, xtop, ybottom, ytop
bp <- rep(bp1, each = cats)
ybot <- apply(rbind(0, dat), 2, cumsum)[-(cats + 1), ]
ytop <- apply(dat, 2, cumsum)
plot(extendrange(bp1), c(0,1), type = 'n', axes = FALSE, ann = FALSE)
rect(bp - rate, ybot, bp + rate, ytop, col = seq(cats))
text(bp, (ytop + ybot) / 2, LETTERS[seq(cats)])
axis(1, bp1, labels = format(Sys.Date() + seq(days), '%d %b %Y'), lwd = 0)
axis(2)
Probably not very useful, but you can invert the color you are plotting so that you can actually see the labels:
#' Invert colours.
#'
#' @param color One or more colour specifications accepted by col2rgb().
#' @return A character vector of "#rrggbb" strings, one per input colour,
#'   where every red/green/blue channel v has been replaced by 255 - v.
inv_col <- function(color) {
# col2rgb() gives one column per colour; abs(255 - .) flips each channel.
# The inner apply() formats the channels of each colour as two-digit hex, the
# outer apply() pastes those digits together, and paste0() adds the '#'.
paste0('#', apply(apply(rbind(abs(255 - col2rgb(color))), 2, function(x)
format(as.hexmode(x), 2)), 2, paste, collapse = ''))
}
inv_col(palette())
# [1] "#ffffff" "#00ffff" "#ff32ff" "#ffff00" "#ff0000" "#00ff00" "#0000ff" "#414141"
plot(extendrange(bp1), c(0,1), type = 'n', axes = FALSE, ann = FALSE)
rect(bp - rate, ybot, bp + rate, ytop, col = seq(cats), xpd = NA, border = NA)
text(bp, (ytop + ybot) / 2, LETTERS[seq(cats)], col = inv_col(seq(cats)))
axis(1, bp1, labels = format(Sys.Date() + seq(days), '%d %B\n%Y'), lwd = 0)
axis(2)

Compute unique axis labels according to gps data in pandas DataFrame

I have a file with GPS coordinates and a scalar value measured from a
bus following some route. I would like to produce a plot with distance
travelled on the x-axis and the scalar value plotted along the
y-axis. I would like to label the x-axis with kilometers and also
with labels indicating the bus stops I am interested in.
To illustrate the problem, here is some code to make a MWE for a
similar problem where a bus is travelling along a triangular route with
corners A, B and C at xy-coordinates [0,0], [0,1], [1,0]. The bus is
travelling in a loop A-->B-->C-->A, and so on. The scalar value is x+y,
i.e. the sum of the coordinate positions.
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
n = 30   # number of samples per leg of the triangle
L = 1.0  # length scale of the route


def generate_route(num_loops):
    """Generate noisy x, y coordinates and the scalar x+y for a looping bus.

    The bus travels A -> B -> C -> A ``num_loops`` times, with ``n`` samples
    per leg. Every coordinate gets an independent random offset in
    ``[0, 0.5 * L / n)``.

    Args:
        num_loops: Number of times the triangular route is driven.

    Returns:
        ``[x, y, val]``: three lists of equal length ``3 * n * num_loops``,
        where ``val[i] == x[i] + y[i]``.
    """
    def get_perturb():
        return float(np.random.rand(1)[0] * 0.5*(L/n))

    x = []
    y = []
    for _ in range(num_loops):
        # A to B
        x += [0.0] * n
        y += [i*L/float(n) for i in range(0, n)]
        # B to C
        x += [i*L/float(n) for i in range(0, n)]
        y += [1.0 - i*L/float(n) for i in range(0, n)]
        # C to A
        x += [1.0 - i*L/float(n) for i in range(0, n)]
        y += [0.0] * n

    # Build real lists: on Python 3 map() is lazy, so computing val would
    # exhaust x and y and the caller could not take len() of them.
    x = [i + get_perturb() for i in x]
    y = [i + get_perturb() for i in y]
    val = [xi + yi for xi, yi in zip(x, y)]
    return [x, y, val]
x, y, val = generate_route(3)
# put data into DataFrame
d = {'x':x,'y':y,'val':val}
df = pd.DataFrame(d, index = pd.date_range(dt.datetime.today().replace(microsecond=0),periods=len(x),freq='1s'))
# plot route
plt.figure()
df.plot(x='x', y='y',xlim=[-0.1,1.1],ylim=[-0.1,1.1])
ax = plt.gca()
ax.set_title('Route: x vs y')
# plot bus stops
stops = {'A':[0.,0.], 'B':[0.,1.], 'C':[1.,0.]}
ax.plot(stops['A'][0], stops['A'][1], 'r.', markersize=20)
ax.plot(stops['B'][0], stops['B'][1], 'g.', markersize=20)
ax.plot(stops['C'][0], stops['C'][1], 'y.', markersize=20)
# plt.savefig('route.png')
# compute distance travelled as sum of line segments connecting adjacent readings
df = pd.concat([df,df[['x','y']].rename(columns={'x':'x_prev','y':'y_prev'},copy=True).shift(1)],axis=1).dropna()
df['Dist'] = np.sqrt((df['x']-df['x_prev'])**2 + (df['y']-df['y_prev'])**2)
df['TotalDist'] = df['Dist'].cumsum(0)
# plot value with distance
plt.figure()
df.plot(x='TotalDist', y='val')
ax = plt.gca()
ax.set_title('TotalDist vs val')
# plt.savefig('totaldistvsval.png')
plt.show()
Output figures:
This is where I get stuck: I would like to add bus-stop labels along the x-axis
of the TotalDist vs val plot, e.g. labelled vertical lines or
similar. I have the following code to label each row of the DataFrame
with the stop it is close to. One problem is that many rows will
match each stop, so I need to pick just one row in each set and then use that
to add labels to the x-axis.
def label_stops(row):
    """Return the name of the stop that *row* is close to, or 'None'.

    A row is close to a stop when the Euclidean distance between
    ``(row['x'], row['y'])`` and the stop location is below ``3.0 * (L / n)``.
    If several stops qualify, the last one iterated wins.
    """
    def close(p, q):
        return np.sqrt((p[0]-q[0])**2 + (p[1]-q[1])**2) < 3.0*(L/n)

    res = 'None'
    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    for name, loc in stops.items():
        if close([row['x'], row['y']], loc):
            res = name
    return res
df['label'] = df.apply(label_stops, axis = 1)
df
Which gives the following which is some progress:
val x y x_prev y_prev Dist TotalDist label
2014-09-07 14:57:17 0.046516 0.008194 0.038322 0.014114 0.001992 0.036809 0.036809 A
2014-09-07 14:57:18 0.084732 0.014400 0.070333 0.008194 0.038322 0.032607 0.069416 A
2014-09-07 14:57:19 0.122984 0.013296 0.109688 0.014400 0.070333 0.039370 0.108786 None
2014-09-07 14:57:20 0.154545 0.005306 0.149240 0.013296 0.109688 0.040351 0.149137 None
... ... ... ... ... ... ... ... ...
2014-09-07 14:57:42 0.882114 0.007021 0.875094 0.009029 0.839339 0.035811 0.888190 None
2014-09-07 14:57:43 0.923723 0.015505 0.908218 0.007021 0.875094 0.034194 0.922383 B
2014-09-07 14:57:44 0.952783 0.014462 0.938320 0.015505 0.908218 0.030121 0.952504 B
2014-09-07 14:57:45 0.985179 0.009943 0.975237 0.014462 0.938320 0.037192 0.989696 B
2014-09-07 14:57:46 1.010307 0.007226 1.003080 0.009943 0.975237 0.027976 1.017672 B
... ... ... ... ... ... ... ... ...
2014-09-07 15:01:16 1.011478 1.001009 0.010469 0.970733 0.042690 0.044214 9.412063 C
2014-09-07 15:01:17 0.968017 0.967922 0.000095 1.001009 0.010469 0.034676 9.446738 C
2014-09-07 15:01:19 0.921621 0.907178 0.014444 0.934321 0.008302 0.027829 9.509157 C
2014-09-07 15:01:20 0.876492 0.875172 0.001320 0.907178 0.014444 0.034592 9.543749 None
2014-09-07 15:01:21 0.862456 0.846593 0.015863 0.875172 0.001320 0.032066 9.575814 None
... ... ... ... ... ... ... ... ...
I came up with the following which works fine but is probably not idiomatic.
# Keep the rows whose label is not 'None' and split them into runs of
# consecutive rows that share a label; each run (grp) is a list of rows and
# groups is the list of those runs.
from itertools import groupby

df1 = df[df['label'] != 'None']
labelled_rows = [row for _, row in df1.iterrows()]
# groupby() keeps the first row of every run and emits the final run too;
# the hand-written loop dropped both.
groups = [list(grp) for _, grp in groupby(labelled_rows, key=lambda row: row['label'])]

# extract stop locations: take the middle row of each run
loclines = []
for g in groups:
    # floor division: len(g) / 2 is a float on Python 3 and cannot index a list
    mids = g[len(g) // 2]
    loclines.append([mids['TotalDist'], mids['label']])
# mark stops on plot as coloured vertical lines
plt.figure()
df.plot(x='TotalDist', y='val')
ax = plt.gca()
ax.set_title('TotalDist vs val')
for li, l in enumerate(loclines):
if loclines[li][1] == 'A': color = 'r'
if loclines[li][1] == 'B': color = 'g'
if loclines[li][1] == 'C': color = 'y'
plt.axvline(x=loclines[li][0],color= color)
plt.show()
Resulting figure:

Categories

Resources