Add standard deviation bars from existing std column in data? - python

I'm plotting a chart based on the following data (head only):
Date noctiluca_ave density_ave density_sd
0 2018-03-07 2.0 1027.514332 0.091766
1 2018-03-14 4.0 1027.339988 0.285309
2 2018-03-21 1.0 1027.346413 0.183336
3 2018-03-31 1.0 1027.372996 0.170423
4 2018-04-07 0.0 1027.292119 0.187385
How do I add standard deviation ('density_sd') bars to the density_ave line?
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(hydro_weekly2 ['Date'], hydro_weekly2 ['density_ave'], label='density weekly ave', color='purple')
ax2=ax.twinx()
ax2.plot(hydro_weekly2['Date'], hydro_weekly2['noctiluca_ave'], label='noctiluca abundance' , color='r')
ax.set_ylabel('Density')
ax.set_xlabel('Date')
ax2.set_ylabel('Noctiluca abundance/cells per m3')
ax.set(title="Noctiluca Abundance and Density 2018")
lines, labels = ax.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc="upper left")

You could either replace ax.plot with ax.errorbar() or use ax.fill_between() to show a colored band.
Here is an example with toy data and both approaches combined:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
N = 20
dates = pd.date_range('2018-03-07', periods=N, freq='W-WED')
hydro_weekly2 = pd.DataFrame({'Date': dates,
'noctiluca_ave': np.random.randint(0, 14000, N),
'density_ave': 1027 + np.random.randn(N).cumsum() / 5,
'density_sd': 0.1 + np.abs(np.random.randn(N) / 5)})
fig, ax = plt.subplots(figsize=(10, 10))
ax.errorbar(hydro_weekly2['Date'], hydro_weekly2['density_ave'], yerr=hydro_weekly2['density_sd'],
label='density weekly ave', color='purple')
ax.fill_between(hydro_weekly2['Date'], hydro_weekly2['density_ave'] - hydro_weekly2['density_sd'],
hydro_weekly2['density_ave'] + hydro_weekly2['density_sd'], color='purple', alpha=0.3)
ax2 = ax.twinx()
ax2.plot(hydro_weekly2['Date'], hydro_weekly2['noctiluca_ave'], label='noctiluca abundance', color='r')
ax.set_ylabel('Density')
ax.set_xlabel('Date')
ax2.set_ylabel('Noctiluca abundance/cells per m3')
ax.set(title="Noctiluca Abundance and Density 2018")
plt.show()

Since you are dealing with a DataFrame, here is another way to show the density_sd bars, using pandas plotting. The method works for bar plots as well.
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('test.csv', parse_dates=['Date'], index_col=0)
density_sd = df['density_sd']

# Passing yerr to Series.plot draws error bars on the density line.
ax = df['density_ave'].plot(yerr=density_sd)
# Second series goes on a secondary y-axis of the same Axes; it reuses
# density_sd as its error bars as well.
df['noctiluca_ave'].plot(yerr=density_sd, secondary_y=True, ax=ax)
plt.show()
Output :

Related

Aligning x axis numerical values with corresponding categorical variables in seaborn on twinx

I am trying to plot a dual x-axis seaborn linegraph but the output based on below code shows that both x-axis are stuck in the corner rather than aligning the values with each other.
What am I missing?
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use("fivethirtyeight")
fig, ax = plt.subplots(1,1,figsize=(10,10))
ax2 = ax.twinx()
sns.lineplot(x="BUCKET_SEGMENT_1", y= 'Percentage to Bucket Total', data=df, ax=ax)
sns.lineplot(x="PAST_DUE_DAYS", y= 'Percentage to Bucket Total',data=df, ax=ax2)
plt.show()
DATA
BUCKET_SEGMENT_1 PAST_DUE_DAYS BAL Percentage to Bucket Total
0 Bucket1 3.0 878698.045 74.431434
1 Bucket1 4.0 25747.397 2.180971
2 Bucket2 6.0 171683.523 14.54271
3 Bucket2 7.0 55659.448 4.714716
4 Bucket3 8.0 1589.759 0.134662
Here's an example of what I would like to see
I also tried to use ax and ax2 set limits and xticks but couldn't get far
DF = df.copy()
DF.set_index("BUCKET_SEGMENT_1",inplace=True)
fig, ax = plt.subplots(1,1,figsize=(10,10))
ax2 = ax.twinx()
sns.lineplot(data=DF,x="PAST_DUE_DAYS",y='Percentage to Bucket Total',ax=ax)
sns.lineplot(data=DF.reset_index(),x='BUCKET_SEGMENT_1',y='Percentage to Bucket Total',ax=ax2)
x = int(max(DF.PAST_DUE_DAYS))
ax.set_xlim(0,x)
ax2.set_xticks(DF.index.unique())
To find out what is going on, it helps to first plot both graphs in separate subplots:
# Plot the categorical and the numerical x-axis in two separate subplots to
# see which positions each of them uses.
from io import StringIO

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns  # required: sns.lineplot is called below

df_str = '''BUCKET_SEGMENT_1 PAST_DUE_DAYS BAL "Percentage to Bucket Total"
0 Bucket1 3.0 878698.045 74.431434
1 Bucket1 4.0 25747.397 2.180971
2 Bucket2 6.0 171683.523 14.54271
3 Bucket2 7.0 55659.448 4.714716
4 Bucket3 8.0 1589.759 0.134662'''
# sep=r'\s+' splits on runs of whitespace; it replaces the deprecated
# delim_whitespace=True. The header has one field fewer than the data rows,
# so the leading row number becomes the index.
df = pd.read_csv(StringIO(df_str), sep=r'\s+')

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(14, 5))
sns.lineplot(x="BUCKET_SEGMENT_1", y='Percentage to Bucket Total', data=df, ax=ax1)
sns.lineplot(x="PAST_DUE_DAYS", y='Percentage to Bucket Total', data=df, ax=ax2)
# Put a tick on every day that actually occurs in the data.
ax2.set_xticks(df["PAST_DUE_DAYS"])
plt.tight_layout()
plt.show()
The left subplot uses the categorical bucket names for the x-axis. All percentages are averaged, and an error band is shown. Bucket1, Bucket2 and Bucket3, are at internal positions 0, 1 and 2.
The right subplot uses the numerical PAST_DUE_DAYS positions for the x-axis. There is only one percentage value per day, so no error band is needed.
ax.twinx() will share the x axes. This will mix the first subplot positions (0,1,2) with the numerical positions of the second subplot (3,4,6,7,8). The result is the weird plot you see (clearly from a different dataframe than the example).
Now, to mark the buckets into the numerical plot, you could use groupby() to find out the positions. With the minimum and maximum days, a text could be positioned. Also, alternating colored bands could visualize the ranges.
Here is some example code to get you started:
from itertools import cycle

fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x="PAST_DUE_DAYS", y='Percentage to Bucket Total', data=df, ax=ax)
colors = ['red', 'green']
previous_max_days = 0
# cycle() alternates the colors for however many buckets there are; zip stops
# when the groups run out.
for (group_name, group_data), color in zip(df.groupby("BUCKET_SEGMENT_1"), cycle(colors)):
    # Each band starts where the previous one ended, so the bands tile the
    # axis without gaps. To start each band just before the bucket's first
    # day instead, use group_data['PAST_DUE_DAYS'].min() - 0.5 here.
    min_days = previous_max_days
    max_days = group_data['PAST_DUE_DAYS'].max() + 0.5
    ax.axvspan(min_days, max_days, color=color, alpha=0.1)
    # get_xaxis_transform(): x is in data coordinates, y in axes coordinates,
    # so 0.9 places the bucket name near the top of the plot.
    ax.text((min_days + max_days) / 2, 0.9, group_name, ha='center', fontsize=20, color=color,
            transform=ax.get_xaxis_transform())
    previous_max_days = max_days
plt.show()

Add different shade colors for trend and forecast , with text on the region

import numpy as np
import pandas as pd
df = pd.DataFrame({"y" : np.random.rand(20)})
ax = df.iloc[:15,:].plot(ls="-", color="b")
ax2 = ax.twinx() #Create a twin Axes sharing the xaxis
df.iloc[15:,:].plot(ls="--", color="r", ax=ax)
plt.axhline(y=0.5,linestyle="--",animated=True,label="False Alaram")
plt.show()
So, first 15 are trend and last 5 are predictions.
I want different colors for trend and pred in background.
Also, how can i add text "Historic" and "Forecast" on graph.
I believe you're looking for fill_between:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({"y": np.random.rand(20)})
# First 15 points are the trend, the remaining 5 the forecast.
history = df.iloc[:15]
forecast = df.iloc[15:]

fig, ax = plt.subplots(figsize=(8, 6))

history.plot(ls="-", color="b", ax=ax)
ax.fill_between(history.index, history["y"], alpha=.25, color='b')

forecast.plot(ls="--", color="r", ax=ax)
ax.fill_between(forecast.index, forecast["y"], alpha=.25, color='r')

# No animated=True here: animated artists are excluded from regular drawing,
# so the threshold line would not appear in the window opened by plt.show().
ax.axhline(y=0.5, linestyle="--", label="False Alaram")

ax.legend()
plt.show()

Difficulty aligning xticks to edge of Histogram bin

I am trying to show the frequency of my data throughout the hours of the day, using a histogram, in 3 hour intervals. I therefore use 8 bins.
plt.style.use('seaborn-colorblind')
plt.figure(figsize=(10,5))
plt.hist(comments19['comment_hour'], bins = 8, alpha = 1, align='mid', edgecolor = 'white', label = '2019', density=True)
plt.title('2019 comments, 8 bins')
plt.xticks([0,3,6,9,12,15,18,21,24])
plt.xlabel('Hours of Day')
plt.ylabel('Relative Frequency')
plt.tight_layout()
plt.legend()
plt.show()
However, the ticks are not aligning with the bin edges, as seen from the image below.
You can do either:
plt.figure(figsize=(10,5))
# define the bin edges explicitly and pass them to plt.hist
bins = [0,3,6,9,12,15,18,21,24]
plt.hist(comments19['comment_hour'], bins = bins, alpha = 1, align='mid',
edgecolor = 'white', label = '2019', density=True)
plt.title('2019 comments, 8 bins')
# the plt.xticks([0,3,6,9,12,15,18,21,24]) call from the question is removed here
plt.xlabel('Hours of Day')
plt.ylabel('Relative Frequency')
plt.tight_layout()
plt.legend()
plt.show()
Or:
fig, ax = plt.subplots()
bins = np.arange(0,25,3)
comments19['comment_hour'].plot.hist(ax=ax,bins=bins)
# other plt format
If you set bins=8, matplotlib will compute 9 evenly spaced boundaries, from the lowest value in the input array (0) to the highest (23), so at [0.0, 2.875, 5.75, 8.625, 11.5, 14.375, 17.25, 20.125, 23.0]. To get the 9 boundaries at 0, 3, 6, ... you need to set them explicitly.
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
plt.style.use('seaborn-colorblind')
comments19 = pd.DataFrame({'comment_hour': np.random.randint(0, 24, 100)})
plt.figure(figsize=(10, 5))
plt.hist(comments19['comment_hour'], bins=np.arange(0, 25, 3), alpha=1, align='mid', edgecolor='white', label='2019',
density=True)
plt.title('2019 comments, 8 bins')
plt.xticks(np.arange(0, 25, 3))
plt.xlabel('Hours of Day')
plt.ylabel('Relative Frequency')
plt.tight_layout()
plt.legend()
plt.show()
Note that your density=True means that the total area of the histogram is 1. As each bin is 3 hours wide, the sum of all the bin heights will be 0.33 and not 1.00 as you might expect. To really get a y-axis with relative frequencies, you could make the internal bin widths 1 by dividing the hours by 3. Afterwards you can relabel the x-axis back to hours.
So, following changes could be made for all the bins to sum to 100 %:
from matplotlib.ticker import PercentFormatter
plt.hist(comments19['comment_hour'] / 3, bins=np.arange(9), alpha=1, align='mid', edgecolor='white', label='2019',
density=True)
plt.xticks(np.arange(9), np.arange(0, 25, 3))
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))

Another colour missing in matplotlib

When I try to plot the following data in Python, I do not see the green portion in my graph (see below). Please note that I use Python 2.7.4.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
range = pd.date_range('2015-01-01', '2015-12-31', freq='15min')
df = pd.DataFrame(index = range)
df
# Average speed in miles per hour
df['speed'] = np.random.randint(low=0, high=60, size=len(df.index))
# Distance in miles covered in one 15-minute sample (speed * 0.25 hours)
df['distance'] = df['speed'] * 0.25
# Cumulative distance travelled
df['cumulative_distance'] = df.distance.cumsum()
df.head()
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(df.index, df['speed'], 'g-')
ax2.plot(df.index, df['distance'], 'b-')
ax1.set_xlabel('Date')
ax1.set_ylabel('Speed', color='g')
ax2.set_ylabel('Distance', color='b')
plt.show()
plt.rcParams['figure.figsize'] = 12,5
Speed and distance are directly proportional to each other, so if you normalize the two series you get exactly the same graph. Because both curves are drawn with alpha=1 (opaque), the only color you see is the last one drawn (blue). If you use alpha < 1:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(df.index, df['speed'], 'g-', alpha=0.5)
ax2.plot(df.index, df['distance'], 'b-', alpha=0.1)
ax1.set_xlabel('Date')
ax2.set_ylabel('Distance', color='b')
ax1.set_ylabel('Speed', color='g')
plt.show()
plt.rcParams['figure.figsize'] = 12,5
you see the green color (in fact a mixture of green and blue):

Labeling boxplot in seaborn with median value

How can I label each boxplot in a seaborn plot with the median value?
E.g.
import seaborn as sns
sns.set_style("whitegrid")
tips = sns.load_dataset("tips")
ax = sns.boxplot(x="day", y="total_bill", data=tips)
How do I label each boxplot with the median or average value?
I love when people include sample datasets!
import seaborn as sns

sns.set_style("whitegrid")
tips = sns.load_dataset("tips")
box_plot = sns.boxplot(x="day", y="total_bill", data=tips)

# One median per day. NOTE(review): this relies on the groups coming out in
# the same order in which the boxes are drawn — confirm for your data.
medians = tips.groupby(['day'])['total_bill'].median()
vertical_offset = tips['total_bill'].median() * 0.05  # offset from median for display

for xtick in box_plot.get_xticks():
    # xtick is the box position (0, 1, 2, ...), so fetch the median by
    # position; medians[xtick] would be an integer lookup on an index of
    # day names.
    median = medians.iloc[int(xtick)]
    box_plot.text(xtick, median + vertical_offset, median,
                  horizontalalignment='center', size='x-small', color='w', weight='semibold')
Based on ShikharDua's approach, I created a version which works independent of tick positions. This comes in handy when dealing with grouped data in seaborn (i.e. hue=parameter). Additionally, I added a flier- and orientation-detection.
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects
def add_median_labels(ax, fmt='.1f'):
    """Write the median value onto every box of a boxplot drawn on *ax*.

    The median lines are located from the artists already on the axes rather
    than from tick positions, so grouped (``hue=``) plots are handled too.

    Args:
        ax: Axes containing the boxplot.
        fmt: Format spec applied to the median value, e.g. ``'.1f'``.

    Returns:
        None. Text artists are added to *ax*; if the axes contain no boxes
        nothing is drawn.
    """
    lines = ax.get_lines()
    boxes = [c for c in ax.get_children() if type(c).__name__ == 'PathPatch']
    if not boxes:
        # Nothing to label; also avoids a division by zero below.
        return
    # Every box contributes the same number of lines (one more when fliers
    # are drawn), so the stride between median lines is lines / boxes.
    lines_per_box = len(lines) // len(boxes)
    # Index 4 is taken to be the median line of the first box.
    for median in lines[4::lines_per_box]:
        x, y = (data.mean() for data in median.get_data())
        # choose value depending on horizontal or vertical plot orientation:
        # a median line with constant x belongs to a horizontal boxplot
        value = x if (median.get_xdata()[1] - median.get_xdata()[0]) == 0 else y
        text = ax.text(x, y, f'{value:{fmt}}', ha='center', va='center',
                       fontweight='bold', color='white')
        # create median-colored border around white text for contrast
        text.set_path_effects([
            path_effects.Stroke(linewidth=3, foreground=median.get_color()),
            path_effects.Normal(),
        ])
tips = sns.load_dataset("tips")
ax = sns.boxplot(data=tips, x='day', y='total_bill', hue="sex")
add_median_labels(ax)
plt.show()
This can also be achieved by reading the median from the plot itself, without explicitly computing the median from the data:
box_plot = sns.boxplot(x="day", y="total_bill", data=tips)
ax = box_plot.axes
drawn_lines = ax.get_lines()

# Shared appearance of every median label.
label_style = dict(ha='center', va='center', fontweight='bold', size=10,
                   color='white', bbox=dict(facecolor='#445A64'))

for position in ax.get_xticks():
    # Six lines are drawn per box, in this order:
    # 0 -> p25, 1 -> p75, 2 -> lower whisker, 3 -> upper whisker,
    # 4 -> p50 (median), 5 -> upper extreme value.
    median_line = drawn_lines[4 + position * 6]
    median_value = round(median_line.get_ydata()[0], 1)
    ax.text(position, median_value, f'{median_value}', **label_style)

box_plot.figure.tight_layout()

Categories

Resources