I would like to reproduce the scatterplot below. Here is the code I have so far, but I cannot seem to get the points similar to the seed terms to be the same color as the filled points (seed terms). Any help is appreciated.
Also, I cannot figure out why the first word is the color white, even though I used a specific palette?
# Question script: scatter 5 groups x 10 random 2-D points, highlight the seed
# terms with filled coloured markers and label everything.
import pandas as pd
import numpy as np
# NOTE(review): only the top-level package is imported, yet `plt` is used below;
# presumably `import matplotlib.pyplot as plt` was intended - confirm.
import matplotlib
seed_terms = ['clean', 'recovery', 'spiral', 'tolerance', 'program']
# Shape (5, 10, 2): one row per seed term, ten neighbours each, (x, y) per neighbour.
embeddings_ex = np.random.rand(5, 10, 2)
embeddings_ex = np.array(embeddings_ex)
# Ten words per group, in the same order as the rows of embeddings_ex.
words_ex = [['quit', 'finally', 'pill', 'vomit', 'survive' ,'lil', 'chance' ,'chain', 'zero',
'quickly'],
['bullshit' ,'unrelated', 'everywhere', 'appear' ,'probably' ,'deal',
'mistake', 'window', 'comment', 'honest'],
['majority' ,'familiar', 'queer', 'edgy', 'skin', 'withdrawl' ,'sad', 'develop',
'perfectly', 'daughter'],
['snort', 'cheap', 'brain', 'teach' ,'shoot' ,'inject' ,'freak', 'type', 'black',
'absolute'],
['substitution', 'suboxone', 'country' ,'clinic', 'nerve', 'representation',
'2', 'website' ,'youtuber', 'insane']]
words_ex = np.array(words_ex)
fig, ax = plt.subplots(figsize=(16, 9))
# Flatten to 50 x-values and 50 y-values (row-major, so group 0 comes first).
sc = embeddings_ex[:, :, 0].flatten()
sw = embeddings_ex[:, :, 1].flatten()
# Background layer: every point as a faint hollow black circle.
plt.scatter(sc, sw, s=45, marker='o', alpha=0.2, color="none", edgecolors='k')
# annotate(ax, sc, sw, words, size=11)
# fill points that are seed words and make font bold
# Okabe and Ito color palette
colors = ['#FA4D4D', '#FBC93D', '#E37E3B', '#C13BE3', '#4B42FD', '#D55E00', '#CC79A7']
# sc[i] / sw[i] index the flattened arrays, so the five filled markers are the
# first five points overall (all from group 0), not one point per group.
for i, word in enumerate(seed_terms):
plt.scatter(sc[i], sw[i], marker='o', alpha=.9,
color=colors[i], edgecolors='none', s = 100)
plt.annotate(word, alpha=.5, xy=(sc[i], sw[i]), xytext=(
5, 2), textcoords='offset points', ha='right', va='bottom', size=11)
# annotate similar words
for word in seed_terms:
# get the index of the seed word in the list of seed words
idx = seed_terms.index(word)
# get the x and y coordinates of the seed word
x = embeddings_ex[idx, :, 0].flatten()
y = embeddings_ex[idx, :, 1].flatten()
# get the list of similar words
similar_words = words_ex[idx]
# add annotations with smaller font
# NOTE(review): `annotate` is not defined anywhere in this snippet - presumably
# a helper from the asker's full script; confirm.
annotate(ax, x, y, similar_words, size=6)
# legend
# NOTE(review): labels passed positionally are matched to artists in creation
# order, and the first artist is the hollow background scatter - likely why the
# first legend entry looks white; confirm.
plt.legend(seed_terms, loc=4)
plt.grid(False)
# remove axes and frame
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.gca().spines['left'].set_visible(False)
# ticks
plt.tick_params(axis='both', which='both', bottom=False,
left=False, labelbottom=False, labelleft=False)
The groups of related words are stored as five arrays in a list, so a loop is needed to draw and annotate each group. The one thing to be careful about with this approach is the drawing order: first draw the gray background scatter, then the scatter and annotations for the related words, and finally the scatter and annotations for the seed terms. Everything is drawn from the same coordinate data, so markers drawn later cover the ones drawn earlier; the filled seed markers must therefore be drawn last or the hollow markers would overwrite them. The attached image also keeps the annotations from overlapping, which I assume cannot be achieved with matplotlib alone, so some other tool was probably used for that.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Fixed seed so the random layout is the same on every run.
np.random.seed(20230115)

seed_terms = ['clean', 'recovery', 'spiral', 'tolerance', 'program']

# (5, 10, 2): one row per seed term, ten neighbours each, (x, y) per neighbour.
embeddings_ex = np.random.rand(5, 10, 2)
embeddings_ex = np.array(embeddings_ex)

# Ten related words per seed term, aligned with the rows of embeddings_ex.
words_ex = np.array([
    ['quit', 'finally', 'pill', 'vomit', 'survive', 'lil', 'chance', 'chain', 'zero', 'quickly'],
    ['bullshit', 'unrelated', 'everywhere', 'appear', 'probably', 'deal', 'mistake', 'window', 'comment', 'honest'],
    ['majority', 'familiar', 'queer', 'edgy', 'skin', 'withdrawl', 'sad', 'develop', 'perfectly', 'daughter'],
    ['snort', 'cheap', 'brain', 'teach', 'shoot', 'inject', 'freak', 'type', 'black', 'absolute'],
    ['substitution', 'suboxone', 'country', 'clinic', 'nerve', 'representation', '2', 'website', 'youtuber', 'insane'],
])

fig, ax = plt.subplots(figsize=(16, 9))
sc = embeddings_ex[:, :, 0].flatten()
sw = embeddings_ex[:, :, 1].flatten()

# Layer 1: every point as a faint hollow circle.
ax.scatter(sc, sw, s=45, marker='o', alpha=0.2, color="none", edgecolors='k')

# One colour per seed term (the list carries two spare entries).
colors = ['#FA4D4D', '#FBC93D', '#E37E3B', '#C13BE3', '#4B42FD', '#D55E00', '#CC79A7']

# Layer 2: related words, outlined in the colour of their seed term.
for idx, _ in enumerate(seed_terms):
    outline = colors[idx]
    for w, (xx, yy) in zip(words_ex[idx], embeddings_ex[idx]):
        ax.scatter(xx, yy, s=45, marker='o', color='white', edgecolors=outline)
        ax.annotate(w, xy=(xx, yy), xytext=(5, 2), textcoords='offset points',
                    ha='right', va='bottom', size=6)

# Layer 3: filled seed markers go last so nothing is painted over them.
# The label= keyword is what feeds the legend below.
for i, word in enumerate(seed_terms):
    ax.scatter(sc[i], sw[i], marker='o', alpha=.9, color=colors[i],
               edgecolors='none', s=100, label=word)
    ax.annotate(word, alpha=.5, xy=(sc[i], sw[i]), xytext=(5, 2),
                textcoords='offset points', ha='right', va='bottom', size=11)

# Legend in the lower-right corner, built from the labelled artists only.
ax.legend(loc=4)
ax.grid(False)

# Hide the frame and every tick / tick label.
ax.spines[:].set_visible(False)
ax.tick_params(axis='both', which='both', bottom=False,
               left=False, labelbottom=False, labelleft=False)
plt.show()
Related
I have a Python script that draws a matrix of images, each image is read from disk and is 100x100 pixels. Current result is:
matrix of images
I don't know why Python adds vertical spacing between each row. I tried setting several parameters for plt.subplots. Rendering code is below:
# Draw a gridRows x gridCols matrix of images, one image per subplot.
# gridRows, gridCols, paramsCount, dataset and mpimg come from outside this
# snippet; <some_folder_path> is a placeholder left by the asker.
# NOTE(review): figsize is square (9, 9) regardless of the row/column ratio -
# possibly the source of the extra vertical spacing; confirm.
fig, axs = plt.subplots(
gridRows, gridCols, sharex=True, sharey=False, constrained_layout={'w_pad': 0, 'h_pad': 0, 'wspace': 0, 'hspace': 0}, figsize=(9,9)
)
# k counts cells in row-major order and doubles as the row index into dataset.
k = 0
for i in range(len(axs)):
for j in range(len(axs[i])):
# Pick the image for this cell from the flag in dataset column 2.
if (k < paramsCount and dataset.iat[k,2]):
img = mpimg.imread(<some_folder_path>)
else:
img = mpimg.imread(<some_folder_path>)
ax = axs[i, j]
ax.imshow(img)
ax.axis('off')
# Column headers on the first row, vertical row labels on the first column.
if (i == 0): ax.set_title(dataset.iat[k,1])
if (j == 0): ax.text(-0.2, 0.5, dataset.iat[k,0], transform=ax.transAxes, verticalalignment='center', rotation='vertical', size=12)
# The axes frame is off, so draw a dotted rectangle over the data limits instead.
axi = ax.axis()
rec = plt.Rectangle((axi[0], axi[2]), axi[1] - axi[0], axi[3] - axi[2], fill=False, lw=1, linestyle="dotted")
rec = ax.add_patch(rec)
rec.set_clip_on(False)
k = k + 1
plt.show()
Desired result is like:
desired result
Does anyone have ideas?
I'm sure there are many ways to do this besides tashi's answer. Here the `gridspec_kw` and `subplot_kw` arguments of `plt.subplots` are used to remove the spacing between the subplots and the tick marks. In the loop over the subplots, I remove the margins and tick labels and make each border dashed and gray. The titles and y-axis labels are added based on the loop counter. Since no data was provided, some values are hard-coded, so please replace them with your own data.
import matplotlib.pyplot as plt
import numpy as np

# Fixed seed so the placeholder image is reproducible.
np.random.seed(20220510)
grid = np.random.rand(4, 4)

gridRows, gridCols = 5, 10
titles = np.arange(5, 51, 5)         # one column header per grid column
ylabels = [500, 400, 300, 200, 100]  # one row label per grid row

# wspace/hspace of 0 glue the subplots together; empty tick lists drop the scales.
fig, axs = plt.subplots(gridRows, gridCols,
                        figsize=(8, 4),
                        gridspec_kw={'wspace': 0, 'hspace': 0},
                        subplot_kw={'xticks': [], 'yticks': []}
                        )

for i, ax in enumerate(axs.flat):
    # Derive the cell position from the grid size instead of hard-coding
    # "i < 10" and "[0, 10, 20, 30, 40]", which only hold for ten columns.
    row, col = divmod(i, gridCols)

    ax.imshow(grid, interpolation='lanczos', cmap='viridis', aspect='auto')
    ax.margins(0, 0)

    if row == 0:
        ax.set_title(str(titles[col]))
    if col == 0:
        ax.set_ylabel(ylabels[row])

    ax.set_xticklabels([])
    ax.set_yticklabels([])

    # Dashed gray frame around every cell.
    for spine in ax.spines.values():
        spine.set_linestyle('dashed')
        spine.set_capstyle("butt")
        spine.set_edgecolor('gray')

plt.show()
I realized it has to do with the dimensions passed to figsize. Since rows count is half the columns count, I need to pass figsize(width, width/2).
I'm totally new at using Python for Power BI (or anything really).
I would like to add the value of the bar/scatter at the end of the line. (the datalabel)
Also to have a version where I could have the label inside of the scatter bubble would be cool.
Anyone who could help out here ?
All help appreciated
# libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# `dataset` is not defined in this snippet; per the question it is supplied by
# the Power BI Python visual.
df = pd.DataFrame({'group': dataset.Genre, 'values': dataset.Revenue})
selected = list(dataset.SelectedGenre)

# Sort ascending by value; the y positions are simply 1..N.
ordered_df = df.sort_values(by='values')
my_range = range(1, len(df.index) + 1)

# Highlight the selected genre in orange; both size branches are 150.
is_selected = ordered_df['group'] == selected
my_color = np.where(is_selected, 'orange', 'skyblue')
my_size = np.where(is_selected, 150, 150)

# seaborn is imported only for its effect on the plot's appearance.
import seaborn as sns

# Lollipop chart: a horizontal stem from 0 to the value, then a dot at the tip.
revenue = ordered_df['values']
plt.hlines(y=my_range, xmin=0, xmax=revenue, color=my_color, alpha=1, linewidth=8)
plt.scatter(revenue, my_range, color=my_color, s=my_size, alpha=1)

# Title, tick labels and axis names.
plt.yticks(my_range, ordered_df['group'])
plt.title("What about the B group?", loc='left')
plt.xlabel('Value of the variable')
plt.ylabel('Group')

# Drop the black box around the visual.
plt.box(False)
plt.show()
Found it myself
import matplotlib.pyplot as plt
import numpy as np

# `dataset` is not defined in this snippet; it is expected to come from the host.
x = dataset.Revenue
y = dataset.Genre
labels = dataset.Revenue
val = list(dataset.SelectedGenre)

# Figure and axes.
fig, ax = plt.subplots(1, figsize=(10, 6))
fig.suptitle('Example Of Labelled Scatterpoints')

# Orange for the selected genre, sky blue for the rest.
is_selected = y == val
my_color = np.where(is_selected, 'orange', 'skyblue')
my_size = np.where(is_selected, 2000, 2000)  # not read below: the scatter passes s=1000

# Large dots first, then a thick stem from 0 to each value.
dot_style = dict(color=my_color, s=1000, alpha=1, linewidths=1)
ax.scatter(x, y, **dot_style)
ax.hlines(y, xmin=0, xmax=x, color=my_color, alpha=1, linewidth=8)
def human_format(num):
    """Abbreviate a number with a metric-style suffix, e.g. 1234567 -> '1M'.

    The value is divided by 1000 until it drops below 1000, then rounded to a
    whole number and suffixed with '', 'K', 'M', 'G', 'T' or 'P'.

    Args:
        num: An int or float (negative values keep their sign).

    Returns:
        The abbreviated value as a string. Values beyond the largest suffix
        are expressed in 'P' rather than raising IndexError.
    """
    # add more suffixes if you need them
    suffixes = ['', 'K', 'M', 'G', 'T', 'P']
    last = len(suffixes) - 1

    magnitude = 0
    while abs(num) >= 1000 and magnitude < last:
        magnitude += 1
        num /= 1000

    # Rounding can carry into the next unit (999999 -> 999.999 -> 1000);
    # promote once more so the result reads '1M' instead of '1000K'.
    if abs(round(num)) >= 1000 and magnitude < last:
        magnitude += 1
        num /= 1000

    return '%.0f%s' % (round(num), suffixes[magnitude])
# Write each point's abbreviated value as a text label on top of its marker.
label_style = dict(
    xytext=(-8, 0),              # shift the text 8 points to the left of the point
    textcoords='offset points',  # interpret xytext as an offset in points
    ha='left',                   # horizontally aligned to the left
    va='center',                 # vertically centered
    color='white',               # white text
)
for x_pos, y_pos, label in zip(x, y, labels):
    ax.annotate(human_format(label), xy=(x_pos, y_pos), **label_style)

# Drop the black box around the visual.
plt.box(False)

# Show the plot
plt.show()
Here is the code:
import itertools
import pandas as pd
import matplotlib.pyplot as plt

# Marker colours, recycled endlessly.
colors = itertools.cycle(["r", "b", "g"])

# Small demo data set.
df = pd.DataFrame({
    'x': [1, 2, 3, 4, 5],
    'y': [2, 4, 5, 2, 4],
    'area': [100, 200, 400, 500, 800],
    'label': ['blah1', 'blah2', 'blah3', 'blah4', 'blah5'],
})


def draw_scatter_plot(
    x,
    y,
    marker_size,
    marker_color: itertools.cycle,
    labels
):
    """Scatter the points one by one and write each label at its point.

    x, y, marker_size and labels are indexed in parallel; marker_color is
    advanced once per point. Nothing is plotted when marker_size is empty,
    but the (empty) figure is still shown.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    if marker_size:
        for i, x_val in enumerate(x):
            y_val = y[i]
            ax.scatter(x_val, y_val, color=next(marker_color), s=marker_size[i])
            # The label is anchored at the point itself with va="top", so it
            # hangs from the marker's centre.
            # NOTE(review): cur_font is not defined in this snippet -
            # presumably a font object from the asker's full script; confirm.
            ax.annotate(
                labels[i],
                (x_val, y_val),  # adjust y[i] here
                fontproperties=cur_font,
                fontsize=14,
                ha="center",
                va="top",
            )
    plt.show()


draw_scatter_plot(
    df.x.tolist(),
    df.y.tolist(),
    df.area.tolist(),
    colors,
    df.label.tolist(),
)
Here is the result:
As you can see the labels overlap with the bottom of the circle. How can I calculate the bottom y value of the circle so that I can always position the labels such that they do not overlap with the circles?
The idea is to shift the text down by half the diameter of the scatter marker, `np.sqrt(marker_size[i])/2` (the scatter size `s` is an area in points², so its square root is the marker diameter in points). You can then add an extra 2 points so that the text does not touch the marker.
# Anchor the label at the point, then push it down by half the marker diameter
# (square root of the scatter size) plus a 2-point gap; va="top" hangs the text
# below that offset.
ax.annotate(labels[i],
xy=(x[i], y[i]),
xytext=(0, -np.sqrt(marker_size[i])/2. - 2),
textcoords="offset points",
ha="center",
va="top")
One solution is to use textcoords as shown in this answer. You can turn off the fancy boxes in the original answer.
# Fixed offset: draw every label 15 points below its point, whatever the marker size.
ax.annotate(labels[i],(x[i], y[i]), xytext=(0, -15),
textcoords='offset points', fontsize=14,ha="center",va="top",)
You can change your annotation as follows:
# Label every point with its own coordinates, shifted slightly down and left.
# A plain loop is used because only the side effect matters, and distinct loop
# names avoid shadowing the x / y sequences being iterated.
for x_i, y_i in zip(x, y):
    plt.annotate("{}, {}".format(x_i, y_i), xy=(x_i, y_i), xytext=(x_i - .1, y_i - .16))
With this approach, you have to scale the `xytext` coordinates according to the scaling of your x and y axes.
One more comment: because you are using it in a while loop, you might want to change this to:
# Per-index variant for use inside the while loop; the -.1 / -.16 shifts are in
# data coordinates.
plt.annotate("{}, {}".format(x[i],y[i]), xy=(x[i],y[i]), xytext=(x[i]-.1,y[i]-.16))
I have this plot in which some areas between curves are being filled by definition. Is there any way to include them in legend? Especially where those filled areas are overlapped and as well as that a new and different color is being appeared.
Or there is possibility to define an arbitrary legend regardless of the curves' data?
Using `fill_between` to plot your data will automatically include the filled area in the legend.
To include the areas where the two datasets overlap, you can combine the legend handles from both dataset into a single legend handle.
As pointed out in the comments, you can also define any arbitrary legend handle with a proxy.
Finally, you can define exactly what handles and labels you want to appear in the legend, regardless of the data plotted in your graph.
See the MWE below that illustrates the points stated above:
import matplotlib.pyplot as plt
import numpy as np

plt.close('all')

# Generate some data: 50 random heights plotted against their index.
x = np.random.rand(50)
y = np.arange(len(x))

fig, ax = plt.subplots(figsize=(11, 4))

# Filled areas between each curve and the 0.5 baseline.
fillA = ax.fill_between(y, x-0.25, 0.5, color='darkolivegreen', alpha=0.65, lw=0)
fillB = ax.fill_between(y, x, 0.5, color='indianred', alpha=0.75, lw=0)

# The baseline and the two curves, drawn on top of the fills.
baseline = np.full(len(y), 0.5)
linec, = ax.plot(y, baseline, color='blue', lw=1.5)
linea, = ax.plot(y, x, color='orange', lw=1.5)
lineb, = ax.plot(y, x-0.25, color='black', lw=1.5)

# Proxy artist: never added to the axes, used only as a legend handle.
rec1 = plt.Rectangle((0, 0), 1, 1, fc='blue', lw=0, alpha=0.25)

# Legend entries as (handle, label) pairs; a tuple of handles is one combined entry.
legend_entries = [
    (linea, 'a'),
    (lineb, 'b'),
    (linec, 'c'),
    (fillA, 'A'),
    (fillB, 'B'),
    ((fillA, fillB), 'A+B'),
    (rec1, 'C'),
    ((fillA, fillB, rec1), 'A+B+C'),
]
handles = [handle for handle, _ in legend_entries]
labels = [label for _, label in legend_entries]
ax.legend(handles, labels, loc=2, ncol=4)

ax.axis(ymin=-1, ymax=2)
plt.show()
Yes, you are absolutely right, ian_itor, tacaswell and Jean-Sébastien: a user-defined legend seems to be the only solution. In addition, I used a different line width for those areas so that they can be distinguished from the curves, and got the right colors by adjusting alpha.
# Build the legend from the first five existing handles plus three thick proxy
# lines that stand in for the filled / overlapping areas.
handles, labels = ax.get_legend_handles_labels()
display = (0, 1, 2, 3, 4)

# Proxy artists: wide line segments whose colour and alpha mimic each area.
overlap_1 = plt.Line2D((0, 1), (0, 0), color='firebrick', linestyle='-', linewidth=15, alpha=0.85)
overlap_2 = plt.Line2D((0, 1), (0, 0), color='darkolivegreen', linestyle='-', linewidth=15, alpha=0.65)
# This was assigned to `over_lo_3` but referenced as `overlap_3`, raising NameError.
overlap_3 = plt.Line2D((0, 1), (0, 0), color='indianred', linestyle='-', linewidth=15, alpha=0.75)

ax.legend([handle for i, handle in enumerate(handles) if i in display] + [overlap_1, overlap_2, overlap_3],
          [label for i, label in enumerate(labels) if i in display] + ['D', 'F', 'G'])
I am using python to plot and my codes are:
import matplotlib.pyplot as plt
import numpy as np

# Data to plot: one entry per cell configuration.
x_test = ['grid50', 'grid100', 'grid150', 'grid250', 'grid500', 'grid750',
          'NN5', 'NN10', 'NN15', 'NN20', 'NN50', 'NN100', 'CB', 'CBG']
x = list(range(1, 15))  # x positions 1..14
clf = [0.58502, 0.60799, 0.60342, 0.59629, 0.56464, 0.53757, 0.62567,
       0.63429, 0.63583, 0.63239, 0.63315, 0.63156, 0.60630, 0.52755]
hitrate = [0.80544, 0.89422, 0.94029, 0.98379, 0.99413, 0.99921, 0.99478,
           0.99961, 0.99997, 0.99980, 0.99899, 0.99991, 0.88435, 1.0]
level = [23.04527, 9.90955, 4.35757, 1.46438, 0.51277, 0.15071, 1.30057,
         0.00016, 0.00001, 0.00021, 0.00005, 0.00004, 6.38019, 0]

# Wide figure with room on the right for the extra y-axes.
fig = plt.figure(figsize=(20, 7))
ax = fig.add_subplot(111)
fig.subplots_adjust(right=0.8)
# this is the function to put annotation on bars
def autolabel(rects):
    """Write each bar's value as a text label just above the bar.

    Args:
        rects: Iterable of bar patches exposing ``get_x``, ``get_width`` and
            ``get_height`` (e.g. the container returned by ``ax.bar``).

    The text is placed with ``plt.text``, i.e. on the current axes, centered
    horizontally on the bar at 1.02 times its height.
    """
    for rect in rects:
        height = rect.get_height()
        # Take the label from the bar itself rather than from the global
        # ``clf`` list, so the helper works for any bar container.
        plt.text(rect.get_x() + rect.get_width() / 2., 1.02 * height,
                 '%s' % height, ha='center', va='bottom')
# Name the x positions after the cell configurations.
plt.xticks(x, x_test)

# Red bars: classification results on the primary (left) y-axis.
ins1 = ax.bar(x, clf, color='Red', align='center', label='classification results')
ax.set_ylabel('classification results', color='Red')
ax.tick_params(axis='y', colors='Red')
ax.set_ylim(0, 1.5)
autolabel(ins1)

# Green line: hit rate on a second y-axis, each point labelled with its value.
ax2 = ax.twinx()
ins2, = ax2.plot(x, hitrate, marker='o', color='Green', linewidth=3.0, label='hitrate')
ax2.set_ylabel('hitrate', color='Green')
ax2.tick_params(axis='y', colors='Green')
ax2.set_ylim(0, 1.5)
for x_pos, rate in zip(x, hitrate):
    ax2.annotate(str(rate), xy=(x_pos, rate + 0.02))

# Blue line: obfuscation level on a third y-axis whose spine is moved outwards
# to 1.1 in axes coordinates; points are labelled the same way.
ax3 = ax.twinx()
axes = [ax, ax2, ax3]
ax3.spines['right'].set_position(('axes', 1.1))
ax3.set_frame_on(True)
ax3.patch.set_visible(False)
ins3, = ax3.plot(x, level, marker='^', color='Blue', linewidth=3.0, label='obfuscation level')
ax3.set_ylabel('obfuscation level', color='Blue')
ax3.tick_params(axis='y', colors='Blue')
ax3.set_ylim(0, 25)
for x_pos, lvl in zip(x, level):
    ax3.annotate(str(lvl), xy=(x_pos, lvl + 0.02))

# Shared x-axis, title and a single legend covering all three series.
ax.set_xlabel('Cell Configurations')
ax.set_xlim(0, 15)
ax.set_title('benchmark')
ax.legend([ins1, ins2, ins3], ['clf', 'hit', 'level'])

plt.grid()
plt.show()
And I got a figure like :
The problem is that some of the numbers are not placed where they can be read clearly, and I don't know whether there is a way to place the annotations automatically in a blank area. Any ideas?