Changing order of items in combined sns python graph - python

I need a plot graph with three different axes. So far, I adapted one script I have found on the web. Unfortunately, I have noticed that my data are not ordered properly. At first, I sort my dataframe by a column named 'a', but in the final figure, only this column seems to be sorted. I would like to order all of them. When I print dataframe after sorting everything seems to be fine.
I will really appreciate any help.
Here is my dataframe after I have sorted it based on column 'a' and here is final graph, where only area of catchments is sorted, but names of catchments, mean elevation and mean slope of catchments are not sorted properly.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from matplotlib.ticker import PercentFormatter
data1 = 'path to my .xlsx file'
df = pd.read_excel(data1, 'catchments_basic')# loading data
df_sorted = df.sort_values(by=['a'], ascending=False)
def make_patch_spines_invisible(ax):
ax.set_frame_on(True)
ax.patch.set_visible(False)
for sp in ax.spines.values():
sp.set_visible(False)
sns.set(style="white", rc={"lines.linewidth": 3})
fig, ax = plt.subplots(figsize=(15,10))
fig.subplots_adjust(right=0.75)
ax1 = ax.twinx()
ax2 = ax.twinx()
# Offset the right spine of par2. The ticks and label have already been
# placed on the right by twinx above.
ax2.spines["right"].set_position(("axes", 1.1))
# Having been created by twinx, par2 has its frame off, so the line of its
# detached spine is invisible. First, activate the frame but make the patch
# and spines invisible.
make_patch_spines_invisible(ax2)
# Second, show the right spine.
ax2.spines["right"].set_visible(True)
host = sns.barplot(x=df_sorted['Catchment'],
y=df_sorted["a"],
color='#004488',
label="area",
ax=ax)
par1 = sns.lineplot(x=df_sorted['Catchment'],
y=df_sorted["b"],
color='r',
marker="o",
label="mean elevation",
ax=ax1)
par2 = sns.lineplot(x=df_sorted['Catchment'],
y=df_sorted["c"],
color='g',
marker="o",
label="mean slope",
ax=ax2)
host.set_xlim(-1, 20)
host.set_ylim(0, 1000)
par1.set_ylim(0, 1000)
par2.set_ylim(0, 100)
host.set_xlabel("river catchment")
host.set_ylabel("area [$km^2$]")
par1.set_ylabel("mean elevation [m n. m.]")
par2.set_ylabel("mean slope [%]")
host.yaxis.label.set_color(color='#004488')
par1.yaxis.label.set_color(color='r')
par2.yaxis.label.set_color(color='g')
tkw = dict(size=4, width=1.5)
host.tick_params(axis='y', colors='#004488', **tkw)
host.tick_params(axis='x', colors='black', **tkw)
par1.tick_params(axis='y', colors='r', **tkw)
par2.tick_params(axis='y', colors='g', **tkw)
host.tick_params(axis='povodie', **tkw)
ax2.yaxis.set_major_formatter(PercentFormatter(decimals=0))
for tick in host.get_xticklabels():
tick.set_rotation(45)
host.set_title('Area, mean altitude and mean slope in selected river catchments', family='Arial', size=12, weight='bold')
host.grid(linestyle="dotted", color='black')
host.legend(loc='upper left')
par1.legend(loc='upper center')
par2.legend(loc='upper right')
save_results_to = 'path where I want to save figure
plt.tight_layout(pad=2)
plt.savefig(save_results_to + 'basic_characteristics_bar_line_combination.png', dpi = 300)
plt.show()
print ('done')

you should change sort parameter in sns.lineplot to False

Related

Removing legend from mpl parallel coordinates plot?

I have a parallel coordinates plot with lots of data points so I'm trying to use a continuous colour bar to represent that, which I think I have worked out. However, I haven't been able to remove the default key that is put in when creating the plot, which is very long and hinders readability. Is there a way to remove this table to make the graph much easier to read?
This is the code I'm currently using to generate the parallel coordinates plot:
parallel_coordinates(data[[' male_le','
female_le','diet','activity','obese_perc','median_income']],'median_income',colormap = 'rainbow',
alpha = 0.5)
fig, ax = plt.subplots(figsize=(6, 1))
fig.subplots_adjust(bottom=0.5)
cmap = mpl.cm.rainbow
bounds = [0.00,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N,)
plt.colorbar(mpl.cm.ScalarMappable(norm = norm, cmap=cmap),cax = ax, orientation = 'horizontal',
label = 'normalised median income', alpha = 0.5)
plt.show()
Current Output:
I want my legend to be represented as a color bar, like this:
Any help would be greatly appreciated. Thanks.
You can use ax.legend_.remove() to remove the legend.
The cax parameter of plt.colorbar indicates the subplot where to put the colorbar. If you leave it out, matplotlib will create a new subplot, "stealing" space from the current subplot (subplots are often referenced to by ax in matplotlib). So, here leaving out cax (adding ax=ax isn't necessary, as here ax is the current subplot) will create the desired colorbar.
The code below uses seaborn's penguin dataset to create a standalone example.
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
from pandas.plotting import parallel_coordinates
penguins = sns.load_dataset('penguins')
fig, ax = plt.subplots(figsize=(10, 4))
cmap = plt.get_cmap('rainbow')
bounds = np.arange(penguins['body_mass_g'].min(), penguins['body_mass_g'].max() + 200, 200)
norm = mpl.colors.BoundaryNorm(bounds, 256)
penguins = penguins.dropna(subset=['body_mass_g'])
parallel_coordinates(penguins[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']],
'body_mass_g', colormap=cmap, alpha=0.5, ax=ax)
ax.legend_.remove()
plt.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmap),
ax=ax, orientation='horizontal', label='body mass', alpha=0.5)
plt.show()

How to properly plot a line over bars?

This one used to work fine, but somehow it stopped working (I must have changed something mistakenly but I can't find the issue).
I'm plotting a set of 3 bars per date, plus a line that shows the accumulated value of one of them. But only one or another (either the bars or the line) is properly being plotted. If I left the code for the bars last, only the bars are plotted. If I left the code for the line last, only the line is plotted.
fig, ax = plt.subplots(figsize = (15,8))
df.groupby("date")["result"].sum().cumsum().plot(
ax=ax,
marker='D',
lw=2,
color="purple")
df.groupby("date")[selected_columns].sum().plot(
ax=ax,
kind="bar",
color=["blue", "red", "gold"])
ax.legend(["LINE", "X", "Y", "Z"])
Appreciate the help!
Pandas draws bar plots with the x-axis as categorical, so internally numbered 0, 1, 2, ... and then setting the label. The line plot uses dates as x-axis. To combine them, both need to be categorical. The easiest way is to drop the index from the line plot. Make sure that the line plot is draw first, enabling the labels to be set correctly by the bar plot.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.DataFrame({'date': pd.date_range('20210101', periods=10),
'earnings': np.random.randint(100, 600, 10),
'costs': np.random.randint(0, 200, 10)})
df['result'] = df['earnings'] - df['costs']
fig, ax = plt.subplots(figsize=(15, 8))
df.groupby("date")["result"].sum().cumsum().reset_index(drop=True).plot(
ax=ax,
marker='D',
lw=2,
color="purple")
df.groupby("date")[['earnings', 'costs', 'result']].sum().plot(
ax=ax,
kind="bar",
rot=0,
width=0.8,
color=["blue", "red", "gold"])
ax.legend(['Cumul.result', 'earnings', 'costs', 'result'])
# shorten the tick labels to only the date
ax.set_xticklabels([tick.get_text()[:10] for tick in ax.get_xticklabels()])
ax.set_ylim(ymin=0) # bar plots are nicer when bars start at zero
plt.tight_layout()
plt.show()
Here I post the solution:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
a=[11.3,222,22, 63.8,9]
b=[0.12,-1.0,1.82,16.67,6.67]
l=[i for i in range(5)]
plt.rcParams['font.sans-serif']=['SimHei']
fmt='%.1f%%'
yticks = mtick.FormatStrFormatter(fmt)
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(l, b,'og-',label=u'A')
ax1.yaxis.set_major_formatter(yticks)
for i,(_x,_y) in enumerate(zip(l,b)):
plt.text(_x,_y,b[i],color='black',fontsize=8,)
ax1.legend(loc=1)
ax1.set_ylim([-20, 30])
ax1.set_ylabel('ylabel')
plt.legend(prop={'family':'SimHei','size':8})
ax2 = ax1.twinx()
plt.bar(l,a,alpha=0.1,color='blue',label=u'label')
ax2.legend(loc=2)
plt.legend(prop={'family':'SimHei','size':8},loc="upper left")
plt.show()
The key to this is the command
ax2 = ax1.twinx()

Matplotlib line graph of pandas dataframe with double y axis scale and datetime on x axis

I have a log which describes my home ADSL speeds.
Log entries are in the following format, where the fields are datetime;level;downspeed;upspeed;testhost:
2020-01-06 18:09:45;INFO;211.5;29.1;0;host:spd-pub-rm-01-01.fastwebnet.it
2020-01-06 18:14:39;WARNING;209.9;28.1;0;host:spd-pub-rm-01-01.fastwebnet.it
2020-01-08 10:51:27;INFO;211.6;29.4;0;host:spd-pub-rm-01-01.fastwebnet.it
(for a full sample file -> https://www.dropbox.com/s/tfmj9ozxe5millx/test.log?dl=0 for you to download for the code below)
I wish to plot a matplot figure with the download speeds on the left axis, the upload speeds (which are on a smaller and lower range of values) and have the shortened datetimes under the x tick marks possibly at 45 degrees angle.
"""Plots the adsl-log generated log."""
import matplotlib.pyplot as plt
# import matplotlib.dates as mdates
import pandas as pd
# set field delimiter and set column names which will also cause reading from row 1
data = pd.read_csv("test.log", sep=';', names=[
'datetime', 'severity', 'down', 'up', 'loss', 'server'])
# we need to filter out ERROR records (with 0 speeds)
indexNames = data[data['severity'] == 'ERROR'].index
data.drop(indexNames, inplace=True)
# convert datetime pandas objecti to datetime64
data['datetime'] = pd.to_datetime(data['datetime'])
# use a dataframe with just the data I need; cleaner
speeds_df = data[['datetime', 'down', 'up']]
speeds_df.info() # this shows datetime column is really a datetime64 value now
# now let's plot
fig, ax = plt.subplots()
y1 = speeds_df.plot(ax=ax, x='datetime', y='down', grid=True, label="DL", legend=True, linewidth=2,ylim=(100,225))
y2 = speeds_df.plot(ax=ax, x='datetime', y='up', secondary_y=True, label="UL", legend=True, linewidth=2, ylim=(100,225))
plt.show()
I am now obtaining the plot I need but would appreciate some clarification about the roles of the ax, y1 and y2 axes in the above code.
First, assigning y1 and y2 objects is unnecessary as you will never use them later on. Also, legend=True is the default.
Per matplotlib.pyplot.subplots docs, the return of ax is:
ax : axes.Axes object or array of Axes objects
Per pandas.DataFrame.plot, the ax argument:
ax : matplotlib axes object, default None
Therefore, you are first initializing an array of axes objects (defaulting to one item, nrow=1 and nrow=2), and then assigning it/them according to the pandas plots. Now, normally, you would be overwriting the assignment of ax with ax=ax, but since you employ a secondary y-axis, plots overlay with each other:
# INITIALIZE FIG DIMENSION AND AXES OBJECTS
fig, axs = plt.subplots(figsize=(8,4))
# ASSIGN AXES OBJECTS ACCORDINGLY
speeds_df.plot(ax=axs, x='datetime', y='down', grid=True, label="DL", linewidth=2, ylim=(100,225))
speeds_df.plot(ax=axs, x='datetime', y='up', secondary_y=True, label="UL", linewidth=2, ylim=(100,225))
plt.show()
To illustrate how axes objects can be extended, see below with multiple (non-overlaid) plots.
Example of multiple subplots using nrows=2:
# INITIALIZE FIG DIMENSION AND AXES OBJECTS
fig, axs = plt.subplots(nrows=2, figsize=(8,4))
# ASSIGN AXES OBJECTS WITH INDEXING AND NO Y LIMITS
speeds_df.plot(ax=axs[0], x='datetime', y='down', grid=True, label="DL", linewidth=2)
plt.subplots_adjust(hspace = 1)
speeds_df.plot(ax=axs[1], x='datetime', y='up', label="UL", linewidth=2)
plt.show()
Example of multiple plots using ncols=2:
# INITIALIZE FIG DIMENSION AND AXES OBJECTS
fig, axs = plt.subplots(ncols=2, figsize=(12,4))
# ASSIGN AXES OBJECTS WITH INDEXING AND NO Y LIMITS
speeds_df.plot(ax=axs[0], x='datetime', y='down', grid=True, label="DL", linewidth=2)
speeds_df.plot(ax=axs[1], x='datetime', y='up', label="UL", linewidth=2)
plt.show()
You can even use subplots=True after setting date/time field as index:
# INITIALIZE FIG DIMENSION AND AXES OBJECTS
fig, axs = plt.subplots(figsize=(8,4))
# ASSIGN AXES OBJECT PLOTTING ALL COLUMNS
speeds_df.set_index('datetime').plot(ax=axs, subplots=True, grid=True, label="DL", linewidth=2)
plt.show()
So thanks to #Parfait I hope I understood things correctly. Here the working code:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
###### Prepare the data to plot
# set field delimiter and set column names which will also cause reading from row 1
data = pd.read_csv('test.log', sep=';', names=[
'datetime', 'severity', 'down', 'up', 'loss', 'server'])
# we need to filter out ERROR records (with 0 speeds)
indexNames = data[data['severity'] == 'ERROR'].index
data.drop(indexNames, inplace=True)
# convert datetime pandas object to datetime64
data['datetime'] = pd.to_datetime(data['datetime'])
# use a dataframe with just the data I need; cleaner
speeds_df = data[['datetime', 'down', 'up']]
# now plot the graph
fig, ax = plt.subplots()
color = 'tab:green'
ax.set_xlabel('thislabeldoesnotworkbutcolordoes', color=color)
ax.tick_params(axis='x', labelcolor=color)
color = 'tab:red'
speeds_df.plot(ax=ax, x='datetime', y='down', label="DL", legend=True, linewidth=2, color=color)
ax.set_ylabel('DL', color=color)
ax.tick_params(axis='y', labelcolor=color)
color = 'tab:blue'
ax2 = speeds_df.plot(ax=ax, x='datetime', y='up', secondary_y=True, label="UL", legend=True, linewidth=2, color=color)
ax2.set_ylabel('UL', color=color)
ax2.tick_params(axis='y', labelcolor=color)
# using ylim in the plot command params does not work the same
# cannot show a grid since the two scales are different
ax.set_ylim(10, 225)
ax2.set_ylim(15, 50)
plt.show()
Which gives:
What I still don't get is:
a) why the x-axis label only seems to honour the color but not the string value :(
b) why the ylim=(n,m) parameters in the df plot does not work well and I have to use the ax.set_ylim constructs instead

Seaborn scatterplot legend showing true values and normalized continuous color

I have a dataframe that I'd like to use to build a scatterplot where different points have different colors:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
dat=pd.DataFrame(np.random.rand(20, 2), columns=['x','y'])
dat['c']=np.random.randint(0,100,20)
dat['c_norm']=(dat['c']-dat['c'].min())/(dat['c'].max()-dat['c'].min())
dat['group']=np.append(np.repeat('high',10), np.repeat('low',10))
As you can see, the column c_norm shows the c column has been normalized between 0 and 1. I would like to show a continuous legend whose color range reflect the normalized values, but labeled using the original c values as label. Say, the minimum (1), the maximum (86), and the median (49). I also want to have differing markers depending on group.
So far I was able to do this:
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
for row in dat.index:
if(dat.loc[row,'group']=='low'):
i_marker='.'
else:
i_marker='x'
ax.scatter(
x=dat.loc[row,'x'],
y=dat.loc[row,'y'],
s=50, alpha=0.5,
marker=i_marker
)
ax.legend(dat['c_norm'], loc='center right', bbox_to_anchor=(1.5, 0.5), ncol=1)
Questions:
- How to generate a continuous legend based on the values?
- How to adapt its ticks to show the original ticks in c, or at least a min, max, and mean or median?
Thanks in advance
Partial answer. Do you actually need to determine your marker colors based on the normed values? See the output of the snippet below.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dat = pd.DataFrame(np.random.rand(20, 2), columns=['x', 'y'])
dat['c'] = np.random.randint(0, 100, 20)
dat['c_norm'] = (dat['c'] - dat['c'].min()) / (dat['c'].max() - dat['c'].min())
dat['group'] = np.append(np.repeat('high', 10), np.repeat('low', 10))
fig, (ax, bx) = plt.subplots(nrows=1, ncols=2, num=0, figsize=(16, 8))
mask = dat['group'] == 'low'
scat = ax.scatter(dat['x'][mask], dat['y'][mask], s=50, c=dat['c'][mask],
marker='s', vmin=np.amin(dat['c']), vmax=np.amax(dat['c']),
cmap='plasma')
ax.scatter(dat['x'][~mask], dat['y'][~mask], s=50, c=dat['c'][~mask],
marker='X', vmin=np.amin(dat['c']), vmax=np.amax(dat['c']),
cmap='plasma')
cbar = fig.colorbar(scat, ax=ax)
scat = bx.scatter(dat['x'][mask], dat['y'][mask], s=50, c=dat['c_norm'][mask],
marker='s', vmin=np.amin(dat['c_norm']),
vmax=np.amax(dat['c_norm']), cmap='plasma')
bx.scatter(dat['x'][~mask], dat['y'][~mask], s=50, c=dat['c_norm'][~mask],
marker='X', vmin=np.amin(dat['c_norm']),
vmax=np.amax(dat['c_norm']), cmap='plasma')
cbar2 = fig.colorbar(scat, ax=bx)
plt.show()
You could definitely modify the second colorbar so that it matches the first one, but is that necessary?

Adjust y-axis in Seaborn multiplot

I'm plotting a CSV file from my simulation results. The plot has three graphs in the same figure fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(24, 6)).
However, for comparison purposes I want the y-axis in all graphs starting at zero and the ending at a specific value. I tried the solution mentioned here from the Seaborn author. I don't get any errors, but the solution also does not work for me.
Here's my script:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
fname = 'results/filename.csv'
def plot_file():
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(24, 6))
df = pd.read_csv(fname, sep='\t')
profits = \
df.groupby(['providerId', 'periods'], as_index=False)['profits'].sum()
# y-axis needs to start at zero and end at 10
g = sns.lineplot(x='periods',
y='profits',
data=profits,
hue='providerId',
legend='full',
ax=axes[0])
# y-axis need to start at zero and end at one
g = sns.scatterplot(x='periods',
y='price',
hue='providerId',
style='providerId',
data=df,
legend=False,
ax=axes[1])
# y-axis need to start at zero and end at one
g = sns.scatterplot(x='periods',
y='quality',
hue='providerId',
style='providerId',
data=df,
legend=False,
ax=axes[2])
g.set(ylim=(0, None))
plt.show()
print(g) # -> AxesSubplot(0.672059,0.11;0.227941x0.77)
The resulting figure is as follows:
How can I adjust each individual plot?
Based on the way you've written your code, you can refer to each subplot axis with g.axis and use g.axis.set_ylim(low,high). (A difference compared to the linked answer is that your graphs are not being plotted on a seaborn FacetGrid.)
An example using dummy data and different axis ranges to illustrate:
df = pd.DataFrame(np.random.uniform(0,10,(100,2)), columns=['a','b'])
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(8,4))
g = sns.lineplot(x='a',
y='b',
data=df.sample(10),
ax=axes[0])
g.axes.set_ylim(0,25)
g = sns.scatterplot(x='a',
y='b',
data=df.sample(10),
ax=axes[1])
g.axes.set_ylim(0,3.5)
g = sns.scatterplot(x='a',
y='b',
data=df.sample(10),
ax=axes[2])
g.axes.set_ylim(0,0.3)
plt.tight_layout()
plt.show()

Categories

Resources