How to neaten up this code into a more Pythonic way? - python

I have plotted a box and whiskers plot for my data using the following code:
def make_labels(ax, boxplot):
    """Write text labels next to the first box of a boxplot.

    ``boxplot`` is indexed like the dictionary returned by matplotlib's
    ``boxplot`` call: the first entry of ``'boxes'``, ``'medians'`` and
    ``'fliers'`` and the first two entries of ``'caps'`` are read. One label
    is written with ``ax.text`` for the median, the lowest and highest box
    y-value, both caps and every flier.
    """
    box_line = boxplot['boxes'][0]
    cap_lines = boxplot['caps']
    median_line = boxplot['medians'][0]
    flier_line = boxplot['fliers'][0]

    # Labels sit to the right of the median line's second x-value, shifted by
    # a tenth of that line's length.
    median_x = median_line.get_xdata()
    x_offset = 0.1 * (median_x[1] - median_x[0])
    label_x = median_x[1] + x_offset

    box_y = box_line.get_ydata()
    annotations = [
        ('Median = {:6.3g}', median_line.get_ydata()[1]),
        ('25th percentile = {:6.3g}', box_y.min()),
        ('75th percentile = {:6.3g}', box_y.max()),
        ('Bottom cap = {:6.3g}', cap_lines[0].get_ydata()[0]),
        ('Top cap = {:6.3g}', cap_lines[1].get_ydata()[0]),
    ]
    for template, value in annotations:
        ax.text(label_x, value, template.format(value), va='center')

    # Flier labels are anchored at x = 1 plus the same offset
    # (NOTE(review): presumably the position of the first box - confirm).
    for flier_y in flier_line.get_ydata():
        ax.text(1 + x_offset, flier_y, 'Flier = {:6.3g}'.format(flier_y), va='center')
and this gives me the following graph:
Now, what I want to do is to grab all the 'Flier' points that we can see in the graph and make it into a list and for that I did the following:
fliers_data = []


def boxplots(boxplot):
    """Collect the flier values lying outside the 1.5 * IQR fences.

    ``boxplot`` is indexed like the dictionary returned by matplotlib's
    ``boxplot`` call; only the first box and the first flier artist are read.
    The lowest and highest y-value of the box are taken as the 25th and 75th
    percentile.

    Every flier above ``pc75 + 1.5 * IQR`` or below ``pc25 - 1.5 * IQR`` is
    appended to the module-level ``fliers_data`` list. The values found by
    this call are also returned, so callers do not have to rely on the
    global.
    """
    box_y = boxplot['boxes'][0].get_ydata()
    pc25 = box_y.min()
    pc75 = box_y.max()
    inter_quart_range = pc75 - pc25
    upper_fence = pc75 + 1.5 * inter_quart_range
    lower_fence = pc25 - 1.5 * inter_quart_range

    # Both branches of the former if/elif did the same thing, so one
    # combined condition is enough.
    outliers = [
        flier
        for flier in boxplot['fliers'][0].get_ydata()
        if flier > upper_fence or flier < lower_fence
    ]
    fliers_data.extend(outliers)
    return outliers
Now, I have 2 queries:
In both functions, there are a few lines that are similar. Is there a way I can define them once and use them in both the functions?
Can the second function be edited or neatened in a more efficient way?

I think it's mostly quite neat. The only things I can suggest are blank lines between the different parts of the functions, and maybe some comments to tell the reader what each part does.
Something like this, for example:
def myfunction(x):
    """Return True for 10, the string "equals zero" for 0, False otherwise."""
    # ten is the only value that yields True
    if x == 10:
        return True
    # zero gets a descriptive string; anything else is False
    return "equals zero" if x == 0 else False
Also, I think you can define any variables that both functions share outside and before the functions (say, at the very start of your code); they will still be accessible inside the functions.

Related

Reorder Sankey diagram vertically based on label value

I'm trying to plot patient flows between 3 clusters in a Sankey diagram. I have a pd.DataFrame counts with from-to values, see below. To reproduce this DF, here is the counts dict that should be loaded into a pd.DataFrame (which is the input for the visualize_cluster_flow_counts function).
from to value
0 C1_1 C1_2 867
1 C1_1 C2_2 405
2 C1_1 C0_2 2
3 C2_1 C1_2 46
4 C2_1 C2_2 458
... ... ... ...
175 C0_20 C0_21 130
176 C0_20 C2_21 1
177 C2_20 C1_21 12
178 C2_20 C0_21 0
179 C2_20 C2_21 96
The from and to values in the DataFrame represent the cluster number (either 0, 1, or 2) and the amount of days for the x-axis (between 1 and 21). If I plot the Sankey diagram with these values, this is the result:
Code:
import plotly.graph_objects as go
def visualize_cluster_flow_counts(counts):
    """Show a Sankey diagram of the flows listed in ``counts``.

    ``counts`` is a DataFrame with ``from`` and ``to`` columns holding node
    names. Its third column is read as the flow size and a fourth column,
    when there is one, as the link label.
    """
    sources = counts['from'].tolist()
    targets = counts['to'].tolist()

    # dict.fromkeys de-duplicates while keeping first-seen order, so the node
    # numbering is the same on every run (a set has no defined order).
    all_sources = list(dict.fromkeys(sources + targets))
    # One dictionary lookup per link instead of a list.index() scan per row.
    node_index = {name: position for position, name in enumerate(all_sources)}

    link = dict(
        source=[node_index[name] for name in sources],
        target=[node_index[name] for name in targets],
        value=counts.iloc[:, 2].tolist(),
    )
    # Only pass link labels when the frame actually has a fourth column.
    if counts.shape[1] > 3:
        link['label'] = counts.iloc[:, 3].tolist()

    fig = go.Figure(data=[go.Sankey(
        arrangement='snap',
        node=dict(
            pad=15,
            thickness=5,
            line=dict(color="black", width=0.1),
            label=all_sources,
            color="blue",
        ),
        link=link,
    )])
    fig.update_layout(title_text="Patient flow between clusters over time: 48h (2 days) - 504h (21 days)", font_size=10)
    fig.show()
visualize_cluster_flow_counts(counts)
However, I would like to vertically order the bars so that the C0's are always on top, the C1's are always in the middle, and the C2's are always at the bottom (or the other way around, doesn't matter). I know that we can set node.x and node.y to manually assign the coordinates. So, I set the x-values to the amount of days * (1/range of days), which is an increment of +- 0.045. And I set the y-values based on the cluster value: either 0, 0.5 or 1. I then obtain the image below. The vertical order is good, but the vertical margins between the bars are obviously way off; they should be similar to the first result.
The code to produce this is:
import plotly.graph_objects as go
def find_node_coordinates(sources, n_days=21):
    """Return ``(x_nodes, y_nodes)`` for node labels such as ``"C1_7"``.

    The number after the last underscore is the day and is scaled to
    ``day * (1 / n_days)`` for the x coordinate. The second character of the
    label is the cluster number and selects the y coordinate: ``"0"`` -> 1,
    ``"1"`` -> 0.5, anything else -> 1e-09.

    ``n_days`` defaults to 21, the value that was previously hard-coded.
    """
    y_by_cluster = {"0": 1, "1": 0.5}
    x_nodes, y_nodes = [], []
    for label in sources:
        day = float(label.split("_")[-1])
        x_nodes.append(day * (1 / n_days))
        y_nodes.append(y_by_cluster.get(label[1], 1e-09))
    return x_nodes, y_nodes
def visualize_cluster_flow_counts(counts):
    """Show a Sankey diagram of ``counts`` with explicitly placed nodes.

    ``counts`` is a DataFrame with ``from`` and ``to`` columns holding node
    names. Its third column is read as the flow size and a fourth column,
    when there is one, as the link label. Node positions come from
    ``find_node_coordinates``.
    """
    sources = counts['from'].tolist()
    targets = counts['to'].tolist()

    # dict.fromkeys de-duplicates while keeping first-seen order, so the node
    # numbering is the same on every run (a set has no defined order).
    all_sources = list(dict.fromkeys(sources + targets))
    node_x, node_y = find_node_coordinates(all_sources)
    # One dictionary lookup per link instead of a list.index() scan per row.
    node_index = {name: position for position, name in enumerate(all_sources)}

    link = dict(
        source=[node_index[name] for name in sources],
        target=[node_index[name] for name in targets],
        value=counts.iloc[:, 2].tolist(),
    )
    # Only pass link labels when the frame actually has a fourth column.
    if counts.shape[1] > 3:
        link['label'] = counts.iloc[:, 3].tolist()

    fig = go.Figure(data=[go.Sankey(
        arrangement='snap',
        node=dict(
            pad=15,
            thickness=5,
            line=dict(color="black", width=0.1),
            label=all_sources,
            color="blue",
            x=node_x,
            y=node_y,
        ),
        link=link,
    )])
    fig.update_layout(title_text="Patient flow between clusters over time: 48h (2 days) - 504h (21 days)", font_size=10)
    fig.show()
visualize_cluster_flow_counts(counts)
Question: how do I fix the margins of the bars, so that the result looks like the first result? So, for clarity: the bars should be pushed to the bottom. Or is there another way that the Sankey diagram can vertically re-order the bars automatically based on the label value?
Firstly, I don't think the currently exposed API offers a smooth way to achieve your goal; you can check the source code here.
Try changing your find_node_coordinates function as follows (note that you should pass the counts DataFrame to it, too):
counts = pd.DataFrame(counts_dict)
def find_node_coordinates(sources, counts, *, flat_on_top=False):
    """Return ``(x_nodes, y_nodes)`` so clusters 0, 1 and 2 stack in a fixed order.

    ``sources`` holds node labels such as ``"C1_7"`` (cluster 1, day 7) and
    ``counts`` is a DataFrame with ``from``, ``to`` and ``value`` columns.

    The flux of a node is the larger of its summed incoming and outgoing
    values. Fluxes are converted to a height using the total outgoing value
    of day 1 as the reference, and every node centre is placed from the
    heights of the nodes stacked before it plus a fixed margin.

    With ``flat_on_top=False`` (the default) the y values are ``1 - offset``
    with cluster 0 stacked first, then 1, then 2. With ``flat_on_top=True``
    the offset itself is used, cluster 2 is pinned at 1e-09 and clusters 1
    and 0 are stacked after it.

    Raises ``ValueError`` when ``flat_on_top`` is false and a label's cluster
    character is not "0", "1" or "2".
    """
    total_margin_width = 0.15
    y_range = 1 - total_margin_width
    margin = total_margin_width / 2  # From number of Cs

    # Sum every node's outgoing and incoming values in a single pass so the
    # flux lookups below do not rescan the whole table each time.
    out_flux, in_flux = {}, {}
    links = zip(
        counts['from'].values.tolist(),
        counts['to'].values.tolist(),
        counts['value'].values.tolist(),
    )
    for src, dst, value in links:
        out_flux[src] = out_flux.get(src, 0) + value
        in_flux[dst] = in_flux.get(dst, 0) + value

    def _calc_day_flux(d=1):
        """Total outgoing value of clusters 0, 1 and 2 on day *d*."""
        return sum(out_flux.get('C{}_{}'.format(i, d), 0) for i in [0, 1, 2])

    def _calc_node_io_flux(node_str):
        """Larger of the node's summed incoming and outgoing values."""
        return max(in_flux.get(node_str, 0), out_flux.get(node_str, 0))

    max_acc = _calc_day_flux()
    graph_unit_per_val = y_range / max_acc
    print("Graph Unit per Acc Val", graph_unit_per_val)

    def _height(cluster, day):
        """Height of one node in graph units."""
        return _calc_node_io_flux('C{}_{}'.format(cluster, day)) * graph_unit_per_val

    def _stack_centre(stack, day):
        """Offset of the centre of the last cluster in *stack* from the flat edge."""
        offset = sum(_height(cluster, day) + margin for cluster in stack[:-1])
        return offset + _height(stack[-1], day) / 2

    x_nodes, y_nodes = [], []
    for s in sources:
        # Shift each x with +- 0.045
        d = int(s.split("_")[-1])
        x_nodes.append(float(d) * (1/21))
        print(s, _calc_node_io_flux(s))

        cluster_number = s[1]
        if flat_on_top:
            if cluster_number == "0":
                y = _stack_centre((2, 1, 0), d)
            elif cluster_number == "1":
                y = _stack_centre((2, 1), d)
            else:
                y = 1e-09
        else:
            if cluster_number == "0":
                y = 1 - _stack_centre((0,), d)
            elif cluster_number == "1":
                y = 1 - _stack_centre((0, 1), d)
            elif cluster_number == "2":
                y = 1 - _stack_centre((0, 1, 2), d)
            else:
                # Fail loudly rather than silently reusing the previous
                # node's y value for an unknown cluster.
                raise ValueError("unknown cluster in node label {!r}".format(s))
        y_nodes.append(y)
    return x_nodes, y_nodes
Sankey diagrams are supposed to weight the width of each connection by its normalized value, right? Here I do the same: first the flux of each node is calculated, then the normalized coordinate of each node's center is calculated from those fluxes.
Here is the sample output of your code with the modified function. Note that I tried to stick to your code as much as possible, so it's a bit unoptimized (for example, one could store the flux of the nodes above each source node to avoid recalculating it).
With flag flat_on_top = True
With flag flat_on_top = False
There is a slight inconsistency in the flat_on_top = False (flat on bottom) version, which I think is caused by padding or some other internals of the Plotly API.

Python - Matplotlib y-axis confusion (scaling)

How can I solve this problem?
I could not find a way to fix this confusing y-axis.
My data is lost when I scale the axis.
When I set minimum and maximum values, the confusion remains.
def ohlcPlot(data,name):
    """Draw a candlestick-style chart of ``data`` titled ``name``.

    Columns are read by position: column 2 to column 1 is drawn as a thin
    black line, and column 0 to column 3 as a thick bar that is green when
    column 3 is larger, red when it is smaller and black when both are equal
    (NOTE(review): presumably open, high, low, close - confirm against the
    caller).
    """
    # Create the figure first: calling plt.figure() after the lines are drawn
    # opens a new, empty figure, and the grid and title end up on that one.
    plt.figure(figsize=plt.figaspect(0.4))  # aspect ratio: 0.4
    for i in range(len(data)):
        first, high, low, last = (data.iloc[i, col] for col in range(4))
        plt.vlines(x=i, ymin=low, ymax=high, color="black", linewidth=1)
        if last > first:
            plt.vlines(x=i, ymin=first, ymax=last, color="green", linewidth=4)
        elif last < first:
            plt.vlines(x=i, ymin=last, ymax=first, color="red", linewidth=4)
        elif last == first:
            plt.vlines(x=i, ymin=last, ymax=first, color="black", linewidth=4)
    plt.grid()
    plt.title(name)

Having trouble getting subplots to show up correctly with larger data sets

I am having trouble getting subplots to show up correctly with larger data sets.
I am ok with having the figure grow for my application. I am also ok with having the figure grow such that all the graphs would be about the size of the ones showing up in the small data set example if that is possible. (anaconda3/v4.2.0/python)
plt.rcParams['figure.autolayout']=True
# One grid cell per user. squeeze=False keeps ``axa`` two-dimensional even
# for a single row or column, and the figure grows with the grid so every
# subplot keeps a readable size (NOTE(review): 4 x 3 inches per cell is a
# starting point - tune to taste).
figa, axa = plt.subplots(rowcnt, colcnt, squeeze=False,
                         figsize=(4 * colcnt, 3 * rowcnt))
figa.suptitle("Users Disk Space Usage Over Time.\n")
ax_index = 0
for r in range(rowcnt):
    for c in range(colcnt):
        ax = axa[r, c]
        # Row-major index of this cell. ``r * c`` is 0 for the whole first
        # row and first column, so it repeated users and skipped others.
        n = r * colcnt + c
        if n >= len(gr.columns):
            # More cells than users: leave the spare cells blank.
            ax.set_visible(False)
            continue
        user = gr.columns[n]
        ur = gr[user]
        x = ur.index
        y = ur.values
        # Advance to the next pair in which both entries are valid colours.
        # NOTE(review): the modulus is len(colorpairs)/2, kept as in the
        # original - confirm it should not be len(colorpairs).
        while not (is_color_like(colorpairs[colorindex])
                   and is_color_like(colorpairs[colorindex+1])):
            colorindex = int((colorindex + 2) % (len(colorpairs)/2))
        ax.plot(x, y, color=colorpairs[colorindex+1], alpha=0.6)
        plt.setp(ax.get_xticklabels(), rotation=30)
        if len(x) > 1:
            ax.fill_between(x, y, color=colorpairs[colorindex],alpha=0.4)
        ax.set_ylim(0,disksizebytes)
        ax.set_title(user)
        ax.set_xlabel('date')
        ax.set_ylabel('space used')
        ax.grid(True)
        # ``i`` was never defined; ``ax_index`` is the counter set up above.
        ax_index += 1
        colorindex = int((colorindex + 2) % (len(colorpairs)/2))
detailarryimage = "{}/detailarryimage.png".format(datafolder)
figa.savefig(detailarryimage)
Small Set Image
Large Set Image

Bokeh equivalent of Matplotlib scatter_matrix

Is there a better way of reproducing matplotlibs scatter_matrix (plot all data against all data) in Bokeh than the code below:
# Every chart in the grid is drawn at 100 x 100.
defaults.width = 100
defaults.height = 100

scatter_plots = []
last_row = len(dataset.columns) - 1
for row, y_col in enumerate(dataset):
    hide_x_axis = row < last_row
    for col, x_col in enumerate(dataset):
        # Plot one column against another from a small two-column frame.
        pair = pd.DataFrame({
            x_col: dataset[x_col].tolist(),
            y_col: dataset[y_col].tolist(),
        })
        plot = Scatter(pair, x=x_col, y=y_col)
        # Only the first column keeps its y axis ...
        if col > 0:
            plot.yaxis.axis_label = ""
            plot.yaxis.visible = False
        # ... and only the last row keeps its x axis.
        if hide_x_axis:
            plot.xaxis.axis_label = ""
            plot.xaxis.visible = False
        scatter_plots.append(plot)

grid = gridplot(scatter_plots, ncols=len(dataset.columns))
show(grid)
In particular I would like to be able to zoom and pan the entire grid of plots as a single entity rather than zoom/pan the subplot the mouse is hovering over.
In general, to have linked panning/zooming, you share the ranges that you want to be linked between plots. This is described here in the Users Guide:
https://docs.bokeh.org/en/latest/docs/user_guide/interaction/linking.html
You can also check out this linked SPLOM example:
https://github.com/bokeh/bokeh/blob/master/examples/models/iris_splom.py
That example is longer/more verbose because it uses the low level bokeh.models API. The important part is where it re-uses the ranges xdr and ydr on every plot that gets created.
In your particular case, since high level charts don't accept range parameters up front (IIRC), I think you'll have to fix up the charts "after the fact", so maybe something like:
# Give every plot the first plot's range objects so they are shared.
first_plot = scatter_plots[0]
xr, yr = first_plot.x_range, first_plot.y_range
for plot in scatter_plots:
    plot.x_range, plot.y_range = xr, yr
In case it is useful: I faced the same problem. In actual fact you don't want all the axes linked, but rather each row's y-axis linked and each column's x-axis linked. I'm surprised that this isn't a built-in Bokeh feature. Even the iris example gets this wrong:
http://docs.bokeh.org/en/latest/docs/gallery/iris_splom.html
Here's a code snippet I used:
def scatter_matrix(dataset):
    """Show a grid of scatter plots of every column against every column.

    Plots in the same row share the y range of the row's first plot and
    plots in the same column share the x range of the plot above them in the
    first row. Only the first column shows a y axis and only the last row
    shows an x axis; those plots are made larger to fit the axis.
    """
    source = ColumnDataSource(data=dataset)
    columns = dataset.columns
    last_row = len(columns) - 1
    scatter_plots = []

    for row, y_col in enumerate(columns):
        for col, x_col in enumerate(columns):
            plot = figure(plot_width=100, plot_height=100,
                          x_axis_label=x_col, y_axis_label=y_col)
            plot.circle(source=source, x=x_col, y=y_col,
                        fill_alpha=0.3, line_alpha=0.3, size=3)

            if col == 0:
                # First plot of the row: it owns the row's y range and
                # keeps its y axis, so it gets extra width.
                row_y_range = plot.y_range
                plot.plot_width = 160
            elif col > 0:
                plot.yaxis.axis_label = ""
                plot.yaxis.visible = False
                plot.y_range = row_y_range

            if row >= last_row:
                # Last row keeps its x axis, so it gets extra height.
                plot.plot_height = 140
            else:
                plot.xaxis.axis_label = ""
                plot.xaxis.visible = False

            # Below the first row, reuse the x range of the first-row plot
            # in the same column.
            if row > 0:
                plot.x_range = scatter_plots[col].x_range

            scatter_plots.append(plot)

    grid = gridplot(scatter_plots, ncols=len(columns))
    show(grid)

how to draw rectangles using list in python

# Draw one box per input line on top of the image paired with that line.
# Fields 2-5 of each comma-separated line are passed to plt.Rectangle as the
# corner (x, y), the width and the height.
for line, images_files in zip(lines, image_list):
    info = line.split(',')
    image_index = int(info[0])  # presumably the number of the image the box belongs to
    # int() needs the individual fields; wrapping each field in a list is
    # what raised "int() argument must be a string or a number, not 'list'".
    box_x, box_y, box_width, box_height = (int(field) for field in info[2:6])

    im = cv2.imread(images_files)
    if im is None:
        # cv2.imread returns None instead of raising when a file cannot be read.
        print("could not read image: {}".format(images_files))
        continue
    im = im[:, :, (2, 1, 0)]  # reverse the channel order (OpenCV loads BGR, imshow expects RGB)

    rect = plt.Rectangle((box_x, box_y), box_width, box_height,
                         linewidth=1, edgecolor='r', facecolor='none')
    ax.add_patch(rect)

    # Display the image
    plt.imshow(im)
    plt.draw()
    plt.pause(0.1)
    plt.cla()
I am supposed to draw boxes on each picture.
To show the boxes on each picture,
I guess I need to gather the locations of the boxes and show them all at the same time.
So I tried passing lists to plt.Rectangle,
but it raised "TypeError: int() argument must be a string or a number, not 'list'".
Is there another way?
Umm, I just did this. I don't know if this is what you wanted, though.
# NOTE: `for unit for range(x)` is not valid Python (SyntaxError); the loop
# header needs `in`, i.e. `for unit in range(x):`.
x = 10
y = 10
a = []
for unit for range(x):
a.append(0)
for unit for range(y):
print(a)
I'm not very familiar with Python, but it seems like you want a plain number in the variables image_index and box_coordinateN. It looks like you're assigning single-element arrays to them. Try changing:
image_index = [int(info[0])]  # list containing one element: int(info[0])
box_coordiante1 = [info[2]]
box_coordiante2 = [info[3]]
box_coordiante3 = [info[4]]
box_coordiante4 = [info[5]]
to:
image_index = int(info[0])  # number: int(info[0])
box_coordiante1 = info[2]
box_coordiante2 = info[3]
box_coordiante3 = info[4]
box_coordiante4 = info[5]
The answer above is carelessly sloppy and incorrect Python.
It must be rewritten and corrected as follows:
# Build a list of x zeros, then print it y times.
x = y = 10
a = [0] * x
for unit in range(y):
    print(a)

Categories

Resources