Printing months in the x axis with pyplot - python

Data I'm working with: https://drive.google.com/file/d/1xb7icmocz-SD2Rkq4ykTZowxW0uFFhBl/view?usp=sharing
Hey everyone,
I am a bit stuck with editing a plot.
Basically, I would like my x value to display the months in the year, but it doesn't seem to work because of the data type (?). Do you have any idea how I could get my plot to have months in the x axis?
If you need more context about the data, please let me know!!!
Thank you!
Here's my code for the plot and the initial data modifications:
import matplotlib.pyplot as plt
import mplleaflet
import pandas as pd
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
import numpy as np
df = pd.read_csv("data/C2A2_data/BinnedCsvs_d400/fb441e62df2d58994928907a91895ec62c2c42e6cd075c2700843b89.csv")
df['degrees']=df['Data_Value']/10
df['Date'] = pd.to_datetime(df['Date'])
df2 = df[df['Date']<'2015-01-01']
df3 = df[df['Date']>='2015-01-01']
max_temp = df2.groupby([(df2.Date.dt.month),(df2.Date.dt.day)])['degrees'].max()
min_temp = df2.groupby([(df2.Date.dt.month),(df2.Date.dt.day)])['degrees'].min()
max_temp2 = df3.groupby([(df3.Date.dt.month),(df3.Date.dt.day)])['degrees'].max()
min_temp2 = df3.groupby([(df3.Date.dt.month),(df3.Date.dt.day)])['degrees'].min()
max_temp.plot(x ='Date', y='degrees', kind = 'line')
min_temp.plot(x ='Date',y='degrees', kind= 'line')
plt.fill_between(range(len(min_temp)),min_temp, max_temp, color='C0', alpha=0.2)
ax = plt.gca()
ax.set(xlabel="Date",
ylabel="Temperature",
title="Extreme Weather in 2015")
plt.legend()
plt.tight_layout()
x = plt.gca().xaxis
for item in x.get_ticklabels():
item.set_rotation(45)
plt.show()
Plot I'm getting:

Option 1 (Most Similar Approach)
Change the index based on month abbreviations using Index.map and calendar
This is just for df2:
import calendar
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv("...")
df['degrees'] = df['Data_Value'] / 10
df['Date'] = pd.to_datetime(df['Date'])
df2 = df[df['Date'] < '2015-01-01']
max_temp = df2.groupby([df2.Date.dt.month, df2.Date.dt.day])['degrees'].max()
min_temp = df2.groupby([df2.Date.dt.month, df2.Date.dt.day])['degrees'].min()
# Update the index to be the desired display format for x-axis
max_temp.index = max_temp.index.map(lambda x: f'{calendar.month_abbr[x[0]]}')
min_temp.index = min_temp.index.map(lambda x: f'{calendar.month_abbr[x[0]]}')
max_temp.plot(x='Date', y='degrees', kind='line')
min_temp.plot(x='Date', y='degrees', kind='line')
plt.fill_between(range(len(min_temp)), min_temp, max_temp,
color='C0', alpha=0.2)
ax = plt.gca()
ax.set(xlabel="Date", ylabel="Temperature", title="Extreme Weather 2005-2014")
x = plt.gca().xaxis
for item in x.get_ticklabels():
item.set_rotation(45)
plt.margins(x=0)
plt.legend()
plt.tight_layout()
plt.show()
As an aside: the title "Extreme Weather in 2015" is incorrect because this data includes all years before 2015. This is "Extreme Weather 2005-2014"
The year range can be checked with min and max as well:
print(df2.Date.dt.year.min(), '-', df2.Date.dt.year.max())
# 2005 - 2014
The title could be programmatically generated with:
title=f"Extreme Weather {df2.Date.dt.year.min()}-{df2.Date.dt.year.max()}"
Option 2 (Simplifying groupby step)
Simplify the code using groupby aggregate to create a single DataFrame then convert the index in the same way as above:
import calendar
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv("...")
df['degrees'] = df['Data_Value'] / 10
df['Date'] = pd.to_datetime(df['Date'])
df2 = df[df['Date'] < '2015-01-01']
# Get Max and Min Degrees in Single Groupby
df2_temp = (
df2.groupby([df2.Date.dt.month, df2.Date.dt.day])['degrees']
.agg(['max', 'min'])
)
# Convert Index to whatever display format is desired:
df2_temp.index = df2_temp.index.map(lambda x: f'{calendar.month_abbr[x[0]]}')
# Plot
ax = df2_temp.plot(
kind='line', rot=45,
xlabel="Date", ylabel="Temperature",
title=f"Extreme Weather {df2.Date.dt.year.min()}-{df2.Date.dt.year.max()}"
)
# Fill between
plt.fill_between(range(len(df2_temp)), df2_temp['min'], df2_temp['max'],
color='C0', alpha=0.2)
plt.margins(x=0)
plt.tight_layout()
plt.show()
Option 3 (Best overall functionality)
Convert the index to a datetime using pd.to_datetime. Choose any leap year to normalize all the data to a single year (it must be a leap year so Feb-29 does not raise an error). Then call set_major_formatter with the format string %b to display the month abbreviation:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv("...")
df['degrees'] = df['Data_Value'] / 10
df['Date'] = pd.to_datetime(df['Date'])
df2 = df[df['Date'] < '2015-01-01']
# Get Max and Min Degrees in Single Groupby
df2_temp = (
df2.groupby([df2.Date.dt.month, df2.Date.dt.day])['degrees']
.agg(['max', 'min'])
)
# Convert to DateTime of Same Year
# (Must be a leap year so Feb-29 doesn't raise an error)
df2_temp.index = pd.to_datetime(
'2000-' + df2_temp.index.map(lambda s: '-'.join(map(str, s)))
)
# Plot
ax = df2_temp.plot(
kind='line', rot=45,
xlabel="Date", ylabel="Temperature",
title=f"Extreme Weather {df2.Date.dt.year.min()}-{df2.Date.dt.year.max()}"
)
# Fill between
plt.fill_between(df2_temp.index, df2_temp['min'], df2_temp['max'],
color='C0', alpha=0.2)
# Set xaxis formatter to month abbr with the %b format string
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
plt.tight_layout()
plt.show()
The benefit of this approach is that the index is a datetime and therefore will format better than the string representations of options 1 and 2.

Related

How can I order dates and show only month+year on the x axis in matplotlib?

I would like to improve my bitcoin dataset, but I found that the dates are not sorted correctly, and I want to show only the month and year. How can I do it?
data = Bitcoin_Historical['Price']
Date1 = Bitcoin_Historical['Date']
train1 = Bitcoin_Historical[['Date','Price']]
#Setting the Date as Index
train2 = train1.set_index('Date')
train2.sort_index(inplace=True)
cols = ['Price']
train2 = train2[cols].apply(lambda x: pd.to_numeric(x.astype(str)
.str.replace(',',''), errors='coerce'))
print (type(train2))
print (train2.head())
plt.figure(figsize=(15, 5))
plt.plot(train2)
plt.xlabel('Date', fontsize=12)
plt.xlim(0,20)
plt.ylabel('Price', fontsize=12)
plt.title("Closing price distribution of bitcoin", fontsize=15)
plt.gcf().autofmt_xdate()
plt.show()
The result is shown in the picture below:
It's not ordered and shows all dates. I would like to order by month+year and show only the month name+year. How can that be done?
Example of Data:
Thank you
I've made the following edits to your code:
converted the Date column to datetime type
cleaned up the Price column and converted it to float
removed the line plt.xlim(0,20) which is causing the output to display 1970
used alternative way to plot, so that the x-axis can be formatted to get monthly tick marks, more info here
Please try the code below:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
Bitcoin_Historical = pd.read_csv('data.csv')
# Take an explicit copy so the column assignments below modify train1 itself
# and not a slice of Bitcoin_Historical. This avoids SettingWithCopyWarning
# without having to silence the warning globally.
train1 = Bitcoin_Historical[['Date','Price']].copy()
# Parse the dates; values that cannot be parsed become NaT instead of raising.
# (infer_datetime_format is deprecated in pandas 2.x and is no longer needed.)
train1['Date'] = pd.to_datetime(train1['Date'], errors='coerce')
# Remove thousands separators and spaces literally, then convert to float
train1['Price'] = (
    train1['Price']
    .str.replace(',', '', regex=False)
    .str.replace(' ', '', regex=False)
    .astype(float)
)
# Set the Date as index and order the rows chronologically
train2 = train1.set_index('Date')
train2.sort_index(inplace=True)
print (type(train2))
print (train2.head())
ax = train2.plot(figsize=(15, 5))
# One major tick per month, labelled as e.g. "2020-Jan"
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%b'))
plt.xlabel('Date', fontsize=12)
plt.ylabel('Price', fontsize=12)
plt.title("Closing price distribution of bitcoin", fontsize=15)
plt.show()
Output
Try to cast your "Date" column into datetime, check if it does the trick:
train1.Date = pd.to_datetime(train1.Date)
train2 = train1.set_index('Date')

How to specify the years on an axis when using plot() on DateTime objects

The plotting goes like this:
plt.plot(df['Date'], df['Price'])
df['Date'] consists of DateTime objects with several years and df['Price'] are integers.
However on the actual line graph, it automatically selects about 4 years spaced apart in the graph with large intervals:
How do I make it so that I can specify the number of years to show on the X axis? Or perhaps show all the years (year only)?
Example:
import pandas as pd
import datetime
import random
dates = []
prices = []
for count in range(10000):
prices.append(random.randint(0, 10))
dates.append(datetime.datetime(random.randint(1960, 2022), random.randint(1, 12), random.randint(1, 27)).strftime("%Y-%m-%d"))
data = {
'Date': dates,
'Price': prices
}
df = pd.DataFrame(data)
df = df.sort_values(by=['Date'], ignore_index = True)
df_temp = df.copy()
df_temp['Date DT'] = pd.to_datetime(df_temp['Date'])
df_temp = df_temp.drop(axis = 'columns', columns = 'Date')
df = df_temp
df
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
plt.figure(figsize=(15, 5), dpi = 1000)
plt.plot(df['Date DT'], df['Price'])
# Labels
plt.xlabel('Dates', fontsize = 8)
plt.ylabel('Prices', fontsize = 8)
# Save
plt.title('Example', fontsize = 15)
plt.savefig('example.png', bbox_inches = 'tight')
You can use the xticks function to set the tick marks and labels on the x-axis.
Like This:
plt.plot(df['Date'], df['Price'])
years = [date.year for date in df['Date']]
plt.xticks(df['Date'], years, rotation=45)
plt.show()
EDIT: to only display one unique year each out of years:
import matplotlib.dates as mdates
plt.plot(df['Date'], df['Price'])
ax = plt.gca()
# Place exactly one major tick per calendar year. Fixing a tick at every row
# of df['Date'] and then passing only the unique years as labels gives a
# tick/label count mismatch, which matplotlib rejects.
ax.xaxis.set_major_locator(mdates.YearLocator())
# Label each tick with the four-digit year only
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
# Rotate the year labels so neighbouring years do not overlap
ax.tick_params(axis='x', labelrotation=45)
plt.show()

How to create a yearly bar plot grouped by months

I'm having a difficult time trying to create a bar plot with a DataFrame grouped by year and month. With the following code I'm trying to plot the data in the figure I created, but instead the plot is returned in a second image. I also tried to move the legend to the right and change its values to the corresponding months.
I started to get a feel for the DataFrames obtained with the groupby command, though not getting what I expected led me to ask you guys.
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
df = pd.read_csv('fcc-forum-pageviews.csv', index_col='date')
line_plot = df.value[(df.value > df.value.quantile(0.025)) & (df.value < df.value.quantile(0.975))]
fig, ax = plt.subplots(figsize=(10,10))
bar_plot = line_plot.groupby([line_plot.index.year, line_plot.index.month]).mean().unstack()
bar_plot.plot(kind='bar')
ax.set_xlabel('Years')
ax.set_ylabel('Average Page Views')
plt.show()
This is the format of the data that I am analyzing.
date,value
2016-05-09,1201
2016-05-10,2329
2016-05-11,1716
2016-05-12,10539
2016-05-13,6933
Add a sorted categorical 'month' column with pd.Categorical
Transform the dataframe to a wide format with pd.pivot_table where aggfunc='mean' is the default.
Wide format is typically best for plotting grouped bars.
pandas.DataFrame.plot returns matplotlib.axes.Axes, so there's no need to use fig, ax = plt.subplots(figsize=(10,10)).
The pandas .dt accessor is used to extract various components of 'date', which must be a datetime dtype
If 'date' is not a datetime dtype, then transform it with df.date = pd.to_datetime(df.date).
Tested with python 3.8.11, pandas 1.3.1, and matplotlib 3.4.2
Imports and Test Data
import pandas as pd
from calendar import month_name # conveniently supplies a list of sorted month names or you can type them out manually
import numpy as np # for test data
# test data and dataframe
np.random.seed(365)
rows = 365 * 3
data = {'date': pd.bdate_range('2021-01-01', freq='D', periods=rows), 'value': np.random.randint(100, 1001, size=(rows))}
df = pd.DataFrame(data)
# select data within specified quantiles
df = df[df.value.gt(df.value.quantile(0.025)) & df.value.lt(df.value.quantile(0.975))]
# display(df.head())
date value
0 2021-01-01 694
1 2021-01-02 792
2 2021-01-03 901
3 2021-01-04 959
4 2021-01-05 528
Transform and Plot
# ordered list of month names; it must be defined before it is used as the
# categories below
months = month_name[1:]
# create the month column
df['months'] = pd.Categorical(df.date.dt.strftime('%B'), categories=months, ordered=True)
# If 'date' has been set to the index, as stated in the comments, use the following instead:
# df['months'] = pd.Categorical(df.index.strftime('%B'), categories=months, ordered=True)
# pivot the dataframe into the correct shape
dfp = pd.pivot_table(data=df, index=df.date.dt.year, columns='months', values='value')
# display(dfp.head())
months January February March April May June July August September October November December
date
2021 637.9 595.7 569.8 508.3 589.4 557.7 508.2 545.7 560.3 526.2 577.1 546.8
2022 567.9 521.5 625.5 469.8 582.6 627.3 630.4 474.0 544.1 609.6 526.6 572.1
2023 521.1 548.5 484.0 528.2 473.3 547.7 525.3 522.4 424.7 561.3 513.9 602.3
# plot
ax = dfp.plot(kind='bar', figsize=(12, 4), ylabel='Mean Page Views', xlabel='Year', rot=0)
_ = ax.legend(bbox_to_anchor=(1, 1.02), loc='upper left')
Just pass the ax you defined to pandas:
bar_plot.plot(ax = ax, kind='bar')
If you also want to replace months numbers with names, you have to get those labels, replace numbers with names and re-define the legend by passing to it the new labels:
handles, labels = ax.get_legend_handles_labels()
new_labels = [datetime.date(1900, int(monthinteger), 1).strftime('%B') for monthinteger in labels]
ax.legend(handles = handles, labels = new_labels, loc = 'upper left', bbox_to_anchor = (1.02, 1))
Complete Code
import pandas as pd
from matplotlib import pyplot as plt
import datetime
df = pd.read_csv('fcc-forum-pageviews.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
line_plot = df.value[(df.value > df.value.quantile(0.025)) & (df.value < df.value.quantile(0.975))]
fig, ax = plt.subplots(figsize=(10,10))
bar_plot = line_plot.groupby([line_plot.index.year, line_plot.index.month]).mean().unstack()
bar_plot.plot(ax = ax, kind='bar')
ax.set_xlabel('Years')
ax.set_ylabel('Average Page Views')
handles, labels = ax.get_legend_handles_labels()
new_labels = [datetime.date(1900, int(monthinteger), 1).strftime('%B') for monthinteger in labels]
ax.legend(handles = handles, labels = new_labels, loc = 'upper left', bbox_to_anchor = (1.02, 1))
plt.show()
(plot generated with fake data)

How to create a min-max lineplot by month

I have time series data of retail beef ad counts, and I intend to make a stacked line chart that shows, on a three-week average basis, the average quantity of ads that grocers posted per store last week. To do so, I managed to aggregate the data for plotting and tried to make the line chart that I want. The main motivation is based on the context of the problem and the desired plot. In my attempt, I couldn't get a very nice line chart because the result is not informative enough to understand. I am wondering how I can achieve this goal in matplotlib. Can anyone suggest what I should do from my current attempt? Any thoughts?
reproducible data and current attempt
Here is minimal reproducible data that I used in my current attempt:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from datetime import timedelta, datetime
url = 'https://gist.githubusercontent.com/adamFlyn/96e68902d8f71ad62a4d3cda135507ad/raw/4761264cbd55c81cf003a4219fea6a24740d7ce9/df.csv'
df = pd.read_csv(url, parse_dates=['date'])
df.drop(columns=['Unnamed: 0'], inplace=True)
df_grp = df.groupby(['date', 'retail_item']).agg({'number_of_ads': 'sum'})
df_grp["percentage"] = df_grp.groupby(level=0).apply(lambda x:100 * x / float(x.sum()))
df_grp = df_grp.reset_index(level=[0,1])
for item in df_grp['retail_item'].unique():
dd = df_grp[df_grp['retail_item'] == item].groupby(['date', 'percentage'])[['number_of_ads']].sum().reset_index(level=[0,1])
dd['weakly_change'] = dd[['percentage']].rolling(7).mean()
fig, ax = plt.subplots(figsize=(8, 6), dpi=144)
sns.lineplot(dd.index, 'weakly_change', data=dd, ax=ax)
ax.set_xlim(dd.index.min(), dd.index.max())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
plt.gcf().autofmt_xdate()
plt.style.use('ggplot')
plt.xticks(rotation=90)
plt.show()
Current Result
But I couldn't get the line chart that I expected; I want to reproduce the plot from this site. Is that achievable? Any idea?
desired plot
here is the example desired plot that I want to make from this minimal reproducible data:
I don't know what changes I should make to my current attempt to get the desired plot above. Does anyone know a possible way of doing this in matplotlib? What else should I do? Any help would be appreciated. Thanks
Also see How to create a min-max plot by month with fill_between?
See in-line comments for details
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import calendar
#################################################################
# setup from question
url = 'https://gist.githubusercontent.com/adamFlyn/96e68902d8f71ad62a4d3cda135507ad/raw/4761264cbd55c81cf003a4219fea6a24740d7ce9/df.csv'
df = pd.read_csv(url, parse_dates=['date'])
df.drop(columns=['Unnamed: 0'], inplace=True)
df_grp = df.groupby(['date', 'retail_item']).agg({'number_of_ads': 'sum'})
df_grp["percentage"] = df_grp.groupby(level=0).apply(lambda x:100 * x / float(x.sum()))
df_grp = df_grp.reset_index(level=[0,1])
#################################################################
# create a month map from long to abbreviated calendar names
month_map = dict(zip(calendar.month_name[1:], calendar.month_abbr[1:]))
# add a month column holding the abbreviated month name of each date
df_grp['month'] = df_grp.date.dt.month_name().map(month_map)
# set month as categorical so they are plotted in the correct order
df_grp.month = pd.Categorical(df_grp.month, categories=list(month_map.values()), ordered=True)
# use groupby to aggregate min, mean and max; the aggregations are given by
# name (not as the Python builtins), which yields the same 'max'/'min'/'mean'
# labels without relying on pandas translating builtins for us
dfmm = (
    df_grp.groupby(['retail_item', 'month'])['percentage']
    .agg(['max', 'min', 'mean'])
    .stack()
    .reset_index(level=[2])
    .rename(columns={'level_2': 'mm', 0: 'vals'})
    .reset_index()
)
# create a palette map for line colors
cmap = {'min': 'k', 'max': 'k', 'mean': 'b'}
# iterate through each retail item and plot the corresponding data
for g, d in dfmm.groupby('retail_item'):
plt.figure(figsize=(7, 4))
sns.lineplot(x='month', y='vals', hue='mm', data=d, palette=cmap)
# select only min or max data for fill_between
y1 = d[d.mm == 'max']
y2 = d[d.mm == 'min']
plt.fill_between(x=y1.month, y1=y1.vals, y2=y2.vals, color='gainsboro')
# add lines for specific years
for year in [2016, 2018, 2020]:
data = df_grp[(df_grp.date.dt.year == year) & (df_grp.retail_item == g)]
sns.lineplot(x='month', y='percentage', ci=None, data=data, label=year)
plt.ylim(0, 100)
plt.margins(0, 0)
plt.legend(bbox_to_anchor=(1., 1), loc='upper left')
plt.ylabel('Percentage of Ads')
plt.title(g)
plt.show()

Heatmap with pandas DateTimeIndex on both axis

I would like to make a heatmap from a pandas DataFrame (or Series) with DateTimeIndex so that I have hours on the x-axis and days on the y-axis, both ticklabels displayed in DateTimeIndex style.
If I do the following:
import pandas as pd
import numpy as np
import seaborn as sns
df = pd.DataFrame(np.random.randint(10, size=4*24*200))
df.index = pd.date_range(start='2019-02-01 11:30:00', periods=200*24*4, freq='15min')
df['minute'] = df.index.hour*60 + df.index.minute
df['dayofyear'] = df.index.month + df.index.dayofyear
df = df.pivot(index='dayofyear', columns='minute', values=df.columns[0])
sns.heatmap(df)
The index obviously loses the DateTime format:
What I instead want is something like this (which I achieved with a complicated, not generalizable function that apparently doesn't even work properly):
Does someone know a neat way to create this kind of heatmap with python?
EDIT:
The function I created:
def plot_heatmap(df_in, plot_column=0, figsize=(20,12), vmin=None, vmax=None, cmap='jet', xlabel='hour (UTC)', ylabel='day', rotation=0, freq='5s'):
    '''
    Plots heatmap with date labels (one row per calendar day, one column
    per time of day)

    df_in: pandas DataFrame or pandas Series with a DatetimeIndex
    plot_column: position of the column to plot if DataFrame has multiple columns
    figsize: size of the matplotlib figure that is created
    vmin, vmax: colour scale limits passed to seaborn.heatmap (None = automatic)
    cmap: colormap passed to seaborn.heatmap
    xlabel, ylabel: axis labels
    rotation: rotation of the y tick labels
    freq: sampling frequency used to pad missing timestamps
    '''
    # convert to DataFrame in case a Series is passed:
    try:
        df_in = df_in.to_frame()
    except AttributeError:
        pass
    # make copy in order not to overwrite input (in case input is an object attribute)
    df = df_in.copy()
    # pad missing dates:
    idx = pd.date_range(df_in.index[0], df_in.index[-1], freq=freq)
    df = df.reindex(idx, fill_value=np.nan)
    # seconds since midnight -> one heatmap column per time of day
    df['hour'] = df.index.hour*3600 + df.index.minute*60 + df.index.second
    # calendar day (timestamp at midnight) -> one heatmap row per day.
    # Using month + dayofyear here is not a real day offset: it jumps at every
    # month boundary and repeats across years, which misplaces the date ticks.
    df['day'] = df.index.normalize()
    # Create mesh for heatmap plotting:
    pivot = df.pivot(index='day', columns='hour', values=df.columns[plot_column])
    # plot
    plt.figure(figsize=figsize)
    sns.heatmap(pivot, cmap=cmap, vmin=vmin, vmax=vmax)
    # set xticks: 25 evenly spaced ticks labelled 0..24 across the day
    plt.xticks(np.linspace(0,pivot.shape[1],25), labels=range(25))
    plt.xlabel(xlabel)
    # set yticks: label only the 1st and 15th of each month
    ylabels = []
    ypositions = []
    for position, day in enumerate(pivot.index):
        # check whether tick shall be shown or not
        if day.day==1 or day.day==15:
            ylabels.append('%s-%s-%s' %(day.year, day.month, day.day))
            # heatmap row i spans [i, i+1]; put the tick at the row centre
            ypositions.append(position + 0.5)
    plt.yticks(ticks=ypositions, labels=ylabels, rotation=rotation)
    plt.ylabel(ylabel)
The date format is going away because you did:
df['dayofyear'] = df.index.month + df.index.dayofyear
Here, both series are integers, so df['dayofyear'] is also integer-typed.
Instead, do:
df['dayofyear'] = df.index.date
Then you get as output:
The best solution I found now that also works if the frequency of the DatetimeIndex is <1min is the following:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt  # required for plt.yticks at the end
freq = '30s'
# number of samples; shared by the random data and its DatetimeIndex
periods = 200*24*4*20
df = pd.DataFrame(np.random.randint(10, size=periods))
df.index = pd.date_range(start='2019-02-01 11:30:00', periods=periods, freq=freq)
# time of day as a string -> heatmap columns (keeps seconds, so freq < 1min works)
df['hour'] = df.index.strftime('%H:%M:%S')
# calendar date -> heatmap rows
df['dayofyear'] = df.index.date
df = df.pivot(index='dayofyear', columns='hour', values=df.columns[0])
# shorten the labels that are shown on the axes
df.columns = pd.DatetimeIndex(df.columns).strftime('%H:%M')
df.index = pd.DatetimeIndex(df.index).strftime('%m/%Y')
# show one x tick label every 2 hours, expressed in number of columns
xticks_spacing = int(pd.Timedelta('2h')/pd.Timedelta(freq))
# show one y tick label every 30 rows (days)
ax = sns.heatmap(df, xticklabels=xticks_spacing, yticklabels=30)
plt.yticks(rotation=0)
Which produces this result:
The only remaining flaw is that the positions of the month ticks are not well defined or precise with this method...

Categories

Resources