Pandas: plot date and time vs value - python

In pandas, my dataframe has the following structure:
# Sample observations: ISO date strings, times written as hhmm integers,
# and the measured values.
raw_data = dict(
    date=['1975-07-03', '1975-07-03', '1975-07-04', '1975-08-01'],
    time=[515, 1014, 1401, 1201],
    value=[1, -1, 2, 11],
)
# Select the columns explicitly so their order is date, time, value.
df = pd.DataFrame(raw_data)[['date', 'time', 'value']]
This question is similar to this one, but I cannot figure out how to modify it.
I need to plot the values in the column "value" versus the two columns "date" and "time". Note that here "time" really is hh:mm.
Edit
Since the year does not change on the x-axis I should have date and time in the format "Month-Day Hour:Minute"

IIUC:
# Turn the hhmm integers into "HH:MM" text. They are zero-padded to four
# digits first so that 515 becomes "0515" and then "05:15".
# Series.str.replace with regex=True is required here: plain Series.replace
# only swaps whole values and would leave the digits untouched.
hhmm = (df['time'].astype(str)
        .str.zfill(4)
        .str.replace(r'^(\d{2})(\d{2})$', r'\1:\2', regex=True))
# Combine date and time into a real datetime column and plot value against it.
(df.assign(date=pd.to_datetime(df['date'] + ' ' + hhmm))
   .plot(x='date', y='value'))

Extending the other answer to include marking specific data points as ticklabels/ticks can be done by using date2num to convert the dates into their tick positions. There are probably better ways to manipulate the date formatting in matplotlib but this method will work.
EDIT: Ensure the hhmm value is zero-padded when it has fewer than 4 characters, and use more idiomatic pandas.
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
# Sample data. 'time' holds hhmm integers; the first entry (415) has only
# three digits, so it needs zero-padding before it can be split into HH:MM.
raw_data = {'date': ['1975-07-03','1975-07-03','1975-07-04','1975-08-01'],
'time': [415,1014,1401,1201], 'value': [1,-1,2,11]}
def fix_time_str(df):
    """Merge the 'date' and 'time' columns into a "YYYY-MM-DD HH:MM" string column.

    Parameters:
        df: DataFrame with a string 'date' column and a 'time' column holding
            hhmm values (e.g. 415 for 04:15).

    Returns:
        The same DataFrame object; its 'date' column is overwritten in place
        with the combined date-time string.
    """
    # Zero-pad to four digits so three-digit times such as 415 split correctly.
    hhmm = df['time'].astype(str).str.zfill(4)
    # .str.replace with regex=True performs the substitution; plain
    # Series.replace only matches whole values and would change nothing.
    hhmm = hhmm.str.replace(r'^(\d{2})(\d{2})$', r'\1:\2', regex=True)
    df['date'] = df['date'] + ' ' + hhmm
    return df
# Build the frame, merge date and time into one string column, then parse
# that column into real datetimes.
df = (pd.DataFrame(raw_data, columns=['date', 'time', 'value'])
      .pipe(fix_time_str)
      .assign(date=lambda x: pd.to_datetime(x['date'])))
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
# Put one tick exactly at each observation's timestamp.
xtick_locs = mpl.dates.date2num(df['date'].tolist())
# Label each tick as "Month-Day Hour:Minute". strftime gives that format
# directly and does not depend on how the timestamp happens to be rendered
# as text, unlike slicing the string form.
xtick_labels = df['date'].dt.strftime('%m-%d %H:%M').tolist()
ax.plot(df['date'], df['value'])
ax.set_xticks(xtick_locs)
ax.set_xticklabels(xtick_labels)
ax.tick_params(axis='x', rotation=90)

Related

Printing months in the x axis with pyplot

Data I'm working with: https://drive.google.com/file/d/1xb7icmocz-SD2Rkq4ykTZowxW0uFFhBl/view?usp=sharing
Hey everyone,
I am a bit stuck with editing a plot.
Basically, I would like my x value to display the months in the year, but it doesn't seem to work because of the data type (?). Do you have any idea how I could get my plot to have months in the x axis?
If you need more context about the data, please let me know!!!
Thank you!
Here's my code for the plot and the initial data modifications:
import matplotlib.pyplot as plt
import mplleaflet
import pandas as pd
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
import numpy as np
# Load the records and derive 'degrees' as Data_Value / 10.
df = pd.read_csv("data/C2A2_data/BinnedCsvs_d400/fb441e62df2d58994928907a91895ec62c2c42e6cd075c2700843b89.csv")
df['degrees']=df['Data_Value']/10
df['Date'] = pd.to_datetime(df['Date'])
# Split at 2015-01-01: df2 holds the earlier rows, df3 the rows from 2015 on.
df2 = df[df['Date']<'2015-01-01']
df3 = df[df['Date']>='2015-01-01']
# Highest and lowest 'degrees' per (month, day) key, for each period.
max_temp = df2.groupby([(df2.Date.dt.month),(df2.Date.dt.day)])['degrees'].max()
min_temp = df2.groupby([(df2.Date.dt.month),(df2.Date.dt.day)])['degrees'].min()
max_temp2 = df3.groupby([(df3.Date.dt.month),(df3.Date.dt.day)])['degrees'].max()
min_temp2 = df3.groupby([(df3.Date.dt.month),(df3.Date.dt.day)])['degrees'].min()
max_temp.plot(x ='Date', y='degrees', kind = 'line')
min_temp.plot(x ='Date',y='degrees', kind= 'line')
# Shade the band between the two lines, using positional x values.
plt.fill_between(range(len(min_temp)),min_temp, max_temp, color='C0', alpha=0.2)
ax = plt.gca()
ax.set(xlabel="Date",
       ylabel="Temperature",
       title="Extreme Weather in 2015")
plt.legend()
plt.tight_layout()
x = plt.gca().xaxis
# The loop body must be indented under the for statement to be valid Python.
for item in x.get_ticklabels():
    item.set_rotation(45)
plt.show()
Plot I'm getting:
Option 1 (Most Similar Approach)
Change the index based on month abbreviations using Index.map and calendar
This is just for df2:
import calendar
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv("...")
df['degrees'] = df['Data_Value'] / 10
df['Date'] = pd.to_datetime(df['Date'])
# Keep only the rows dated before 2015.
df2 = df[df['Date'] < '2015-01-01']
# Highest and lowest 'degrees' per (month, day) key.
max_temp = df2.groupby([df2.Date.dt.month, df2.Date.dt.day])['degrees'].max()
min_temp = df2.groupby([df2.Date.dt.month, df2.Date.dt.day])['degrees'].min()
# Update the index to be the desired display format for x-axis:
# each (month, day) key is replaced by the month abbreviation.
# calendar.month_abbr already yields strings, so no f-string is needed.
max_temp.index = max_temp.index.map(lambda x: calendar.month_abbr[x[0]])
min_temp.index = min_temp.index.map(lambda x: calendar.month_abbr[x[0]])
max_temp.plot(x='Date', y='degrees', kind='line')
min_temp.plot(x='Date', y='degrees', kind='line')
plt.fill_between(range(len(min_temp)), min_temp, max_temp,
                 color='C0', alpha=0.2)
ax = plt.gca()
ax.set(xlabel="Date", ylabel="Temperature", title="Extreme Weather 2005-2014")
x = plt.gca().xaxis
# The loop body must be indented under the for statement to be valid Python.
for item in x.get_ticklabels():
    item.set_rotation(45)
plt.margins(x=0)
plt.legend()
plt.tight_layout()
plt.show()
As an aside: the title "Extreme Weather in 2015" is incorrect because this data includes all years before 2015. This is "Extreme Weather 2005-2014"
The year range can be checked with min and max as well:
print(df2.Date.dt.year.min(), '-', df2.Date.dt.year.max())
# 2005 - 2014
The title could be programmatically generated with:
title=f"Extreme Weather {df2.Date.dt.year.min()}-{df2.Date.dt.year.max()}"
Option 2 (Simplifying groupby step)
Simplify the code using groupby aggregate to create a single DataFrame then convert the index in the same way as above:
import calendar
import matplotlib.pyplot as plt
import pandas as pd
# Read the data and derive the plotted quantity.
df = pd.read_csv("...")
df['degrees'] = df['Data_Value'] / 10
df['Date'] = pd.to_datetime(df['Date'])
# Restrict to rows dated before 2015.
df2 = df[df['Date'] < '2015-01-01']

# One groupby on (month, day) yields both extremes as columns 'max' and 'min'.
month_day_groups = df2.groupby([df2.Date.dt.month, df2.Date.dt.day])
df2_temp = month_day_groups['degrees'].agg(['max', 'min'])

# Swap each (month, day) key for the display label (month abbreviation).
df2_temp.index = df2_temp.index.map(lambda x: f'{calendar.month_abbr[x[0]]}')

# Draw both lines; the title is derived from the years present in df2.
plot_options = dict(
    kind='line', rot=45,
    xlabel="Date", ylabel="Temperature",
    title=f"Extreme Weather {df2.Date.dt.year.min()}-{df2.Date.dt.year.max()}",
)
ax = df2_temp.plot(**plot_options)

# Shade the band between min and max, using positional x values.
plt.fill_between(
    range(len(df2_temp)), df2_temp['min'], df2_temp['max'],
    color='C0', alpha=0.2,
)
plt.margins(x=0)
plt.tight_layout()
plt.show()
Option 3 (Best overall functionality)
Convert the index to a datetime using pd.to_datetime. Pick any leap year so that all dates share the same year (it must be a leap year so Feb-29 does not raise an error). Then call set_major_formatter with a DateFormatter that uses the format string %b to show the month abbreviation:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
# Read the data and derive the plotted quantity.
df = pd.read_csv("...")
df['degrees'] = df['Data_Value'] / 10
df['Date'] = pd.to_datetime(df['Date'])
# Restrict to rows dated before 2015.
df2 = df[df['Date'] < '2015-01-01']

# One groupby on (month, day) yields both extremes as columns 'max' and 'min'.
month_day_groups = df2.groupby([df2.Date.dt.month, df2.Date.dt.day])
df2_temp = month_day_groups['degrees'].agg(['max', 'min'])

# Rebuild the index as datetimes that all fall in the same year.
# (The year must be a leap year so Feb-29 doesn't raise an error.)
month_day_text = df2_temp.index.map(lambda s: '-'.join(map(str, s)))
df2_temp.index = pd.to_datetime('2000-' + month_day_text)

# Draw both lines; the title is derived from the years present in df2.
plot_options = dict(
    kind='line', rot=45,
    xlabel="Date", ylabel="Temperature",
    title=f"Extreme Weather {df2.Date.dt.year.min()}-{df2.Date.dt.year.max()}",
)
ax = df2_temp.plot(**plot_options)

# Shade the band between min and max along the datetime index.
plt.fill_between(
    df2_temp.index, df2_temp['min'], df2_temp['max'],
    color='C0', alpha=0.2,
)
# Show month abbreviations on the x-axis via the %b format string.
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
plt.tight_layout()
plt.show()
The benefit of this approach is that the index is a datetime and therefore will format better than the string representations of options 1 and 2.

How to create a min-max lineplot by month

I have time-series data of retail beef ad counts, and I want to make a stacked line chart that shows, on a three-week average basis, the average number of ads that grocers posted per store last week. I managed to aggregate the data for plotting and tried to draw the line chart I want; the motivation comes from the context of the problem and the desired plot. However, my attempt does not produce a good line chart: the result is not informative. How can I achieve this in matplotlib? Can anyone suggest what I should change in my current attempt? Any thoughts?
reproducible data and current attempt
Here is minimal reproducible data that I used in my current attempt:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from datetime import timedelta, datetime
url = 'https://gist.githubusercontent.com/adamFlyn/96e68902d8f71ad62a4d3cda135507ad/raw/4761264cbd55c81cf003a4219fea6a24740d7ce9/df.csv'
df = pd.read_csv(url, parse_dates=['date'])
df.drop(columns=['Unnamed: 0'], inplace=True)
# Total ads per (date, retail_item).
df_grp = df.groupby(['date', 'retail_item']).agg({'number_of_ads': 'sum'})
# Each item's share of that date's ads, as a percentage.
df_grp["percentage"] = df_grp.groupby(level=0).apply(lambda x:100 * x / float(x.sum()))
df_grp = df_grp.reset_index(level=[0,1])
# One figure per retail item. The loop body must be indented under the for
# statement to be valid Python.
for item in df_grp['retail_item'].unique():
    dd = df_grp[df_grp['retail_item'] == item].groupby(['date', 'percentage'])[['number_of_ads']].sum().reset_index(level=[0,1])
    # Rolling mean of the percentage over a 7-row window.
    dd['weakly_change'] = dd[['percentage']].rolling(7).mean()
    fig, ax = plt.subplots(figsize=(8, 6), dpi=144)
    # x and y are passed by keyword; current seaborn releases do not accept
    # them positionally.
    sns.lineplot(x=dd.index, y='weakly_change', data=dd, ax=ax)
    ax.set_xlim(dd.index.min(), dd.index.max())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    plt.gcf().autofmt_xdate()
    plt.style.use('ggplot')
    plt.xticks(rotation=90)
    plt.show()
Current Result
but I couldn't get correct line chart that I expected, I want to reproduce the plot from this site. Is that doable to achieve this? Any idea?
desired plot
here is the example desired plot that I want to make from this minimal reproducible data:
I don't know how should make changes for my current attempt to get my desired plot above. Can anyone know any possible way of doing this in matplotlib? what else should I do? Any possible help would be appreciated. Thanks
Also see How to create a min-max plot by month with fill_between?
See in-line comments for details
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import calendar
#################################################################
# setup from question
url = 'https://gist.githubusercontent.com/adamFlyn/96e68902d8f71ad62a4d3cda135507ad/raw/4761264cbd55c81cf003a4219fea6a24740d7ce9/df.csv'
df = pd.read_csv(url, parse_dates=['date'])
df.drop(columns=['Unnamed: 0'], inplace=True)
df_grp = df.groupby(['date', 'retail_item']).agg({'number_of_ads': 'sum'})
df_grp["percentage"] = df_grp.groupby(level=0).apply(lambda x:100 * x / float(x.sum()))
df_grp = df_grp.reset_index(level=[0,1])
#################################################################
# create a month map from long to abbreviated calendar names
month_map = dict(zip(calendar.month_name[1:], calendar.month_abbr[1:]))
# update the month column name
df_grp['month'] = df_grp.date.dt.month_name().map(month_map)
# set month as categorical so they are plotted in the correct order
df_grp.month = pd.Categorical(df_grp.month, categories=month_map.values(), ordered=True)
# use groupby to aggregate min mean and max
# (aggregations are named by string rather than by passing the builtins;
# the resulting labels 'max', 'min', 'mean' are what cmap below is keyed on)
dfmm = (
    df_grp.groupby(['retail_item', 'month'])['percentage']
    .agg(['max', 'min', 'mean'])
    .stack()
    .reset_index(level=[2])
    .rename(columns={'level_2': 'mm', 0: 'vals'})
    .reset_index()
)
# create a palette map for line colors
cmap = {'min': 'k', 'max': 'k', 'mean': 'b'}
# iterate through each retail item and plot the corresponding data
for g, d in dfmm.groupby('retail_item'):
    plt.figure(figsize=(7, 4))
    sns.lineplot(x='month', y='vals', hue='mm', data=d, palette=cmap)
    # select only min or max data for fill_between
    y1 = d[d.mm == 'max']
    y2 = d[d.mm == 'min']
    plt.fill_between(x=y1.month, y1=y1.vals, y2=y2.vals, color='gainsboro')
    # add lines for specific years
    for year in [2016, 2018, 2020]:
        data = df_grp[(df_grp.date.dt.year == year) & (df_grp.retail_item == g)]
        sns.lineplot(x='month', y='percentage', ci=None, data=data, label=year)
    plt.ylim(0, 100)
    plt.margins(0, 0)
    plt.legend(bbox_to_anchor=(1., 1), loc='upper left')
    plt.ylabel('Percentage of Ads')
    plt.title(g)
    plt.show()

Heatmap with pandas DateTimeIndex on both axis

I would like to make a heatmap from a pandas DataFrame (or Series) with DateTimeIndex so that I have hours on the x-axis and days on the y-axis, both ticklabels displayed in DateTimeIndex style.
If I do the following:
import pandas as pd
import numpy as np
import seaborn as sns
# Random sample data: one integer column on a 15-minute DatetimeIndex.
df = pd.DataFrame(np.random.randint(10, size=4*24*200))
df.index = pd.date_range(start='2019-02-01 11:30:00', periods=200*24*4, freq='15min')
# Minutes elapsed since midnight; becomes the heatmap's column key.
df['minute'] = df.index.hour*60 + df.index.minute
# NOTE: month + dayofyear is a plain integer, so the pivoted index below
# carries no datetime formatting.
df['dayofyear'] = df.index.month + df.index.dayofyear
# Reshape to one row per day key and one column per minute-of-day.
df = df.pivot(index='dayofyear', columns='minute', values=df.columns[0])
sns.heatmap(df)
The index obviously loses the DateTime format:
What I instead want is something like this (which I achieved with a complicated, not generalizable function that apparently doesn't even work properly):
Does someone know a neat way to create this kind of heatmap with python?
EDIT:
The function I created:
def plot_heatmap(df_in, plot_column=0, figsize=(20,12), vmin=None, vmax=None, cmap='jet', xlabel='hour (UTC)', ylabel='day', rotation=0, freq='5s'):
    '''
    Plot a day-by-time-of-day heatmap with date labels on the y-axis.

    df_in: pandas DataFrame or pandas Series indexed by timestamps
    plot_column: position of the column to plot if the DataFrame has multiple columns
    figsize: size of the new figure
    vmin, vmax: colour-scale limits forwarded to sns.heatmap
    cmap: colormap forwarded to sns.heatmap
    xlabel, ylabel: axis labels
    rotation: rotation of the y tick labels
    freq: frequency of the regular grid the data is reindexed onto
    '''
    # convert to DataFrame in case a Series is passed; otherwise copy so the
    # caller's object is not modified (in case input is an object attribute)
    if isinstance(df_in, pd.Series):
        df = df_in.to_frame()
    else:
        df = df_in.copy()
    # pad missing dates:
    idx = pd.date_range(df.index[0], df.index[-1], freq=freq)
    df = df.reindex(idx, fill_value=np.nan)
    first_day = df.index[0].normalize()
    # seconds since midnight -> heatmap column key
    df['hour'] = df.index.hour*3600 + df.index.minute*60 + df.index.second
    # whole days since the first day -> heatmap row key. Unlike
    # month + dayofyear this grows by exactly 1 per calendar day, so a row's
    # key can be converted back to its date without drifting at month ends.
    df['dayofyear'] = (df.index.normalize() - first_day).days
    # Create mesh for heatmap plotting:
    pivot = df.pivot(index='dayofyear', columns='hour', values=df.columns[plot_column])
    # plot
    plt.figure(figsize=figsize)
    sns.heatmap(pivot, cmap=cmap, vmin=vmin, vmax=vmax)
    # set xticks: 25 evenly spaced ticks labelled 0..24
    plt.xticks(np.linspace(0,pivot.shape[1],25), labels=range(25))
    plt.xlabel(xlabel)
    # set yticks: label only the 1st and 15th of each month
    ylabels = []
    ypositions = []
    for day_delta in pivot.index:
        # create pandas Timestamp for this row
        temp_tick = first_day + pd.Timedelta(days=int(day_delta))
        # check whether tick shall be shown or not
        if temp_tick.day==1 or temp_tick.day==15:
            temp_tick_nice = '%s-%s-%s' %(temp_tick.year, temp_tick.month, temp_tick.day)
            ylabels.append(temp_tick_nice)
            ypositions.append(day_delta)
    plt.yticks(ticks=ypositions, labels=ylabels, rotation=rotation)
    plt.ylabel(ylabel)
The date format is lost because you did:
df['dayofyear'] = df.index.month + df.index.dayofyear
Here, both series are integers, so df['dayofyear'] is also integer-typed.
Instead, do:
df['dayofyear'] = df.index.date
Then you get as output:
The best solution I found now that also works if the frequency of the DatetimeIndex is <1min is the following:
import pandas as pd
import numpy as np
import seaborn as sns
freq = '30s'
# Random sample data on a regular DatetimeIndex with the chosen frequency.
df = pd.DataFrame(np.random.randint(10, size=4*24*200*20))
df.index = pd.date_range(start='2019-02-01 11:30:00', periods=200*24*4*20, freq=freq)
# Time of day (to the second) is the column key; the calendar date is the row key.
df['hour'] = df.index.strftime('%H:%M:%S')
df['dayofyear'] = df.index.date
df = df.pivot(index='dayofyear', columns='hour', values=df.columns[0])
# Shorten the labels for display: HH:MM on the x-axis, MM/YYYY on the y-axis.
df.columns = pd.DatetimeIndex(df.columns).strftime('%H:%M')
df.index = pd.DatetimeIndex(df.index).strftime('%m/%Y')
# Number of columns spanning two hours -> one x tick label every 2h.
xticks_spacing = int(pd.Timedelta('2h')/pd.Timedelta(freq))
ax = sns.heatmap(df, xticklabels=xticks_spacing, yticklabels=30)
# Keep y labels horizontal. This goes through the Axes returned by seaborn
# because matplotlib.pyplot is not imported in this snippet.
ax.tick_params(axis='y', rotation=0)
Which produces this result:
The only flaw yet is that the month ticks positions are not well defined and precise with this method...

Python/Pandas Set y-axis attribute

Code:
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd
import sys
import matplotlib
import numpy as np
import pylab as pl
pl.draw()
# Read the CSV and pull the e_mort_num values for Zimbabwe.
df = pd.read_csv("TB_burden_countries_2018-03-06.csv")
df = df.set_index(["country"])
df2 = df.loc["Zimbabwe", "e_mort_num"]
df = pd.DataFrame(data = df2, columns= ["e_mort_num"])
df.columns = ["Mortality"]
print(df2)
# Hand-written copy of the data; note that both Year and Mortality are
# entered as strings here, not numbers.
dataset = {'Year': ["2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007","2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015","2016"],
'Mortality': ["20000", "18000", "17000", "19000", "19000", "22000", "24000", "24000", "23000", "17000", "13000", "14000","14000", "11000", "11000", "9000","5600"]}
df3 = pd.DataFrame.from_dict(dataset)
df4 = df3[["Year", "Mortality"]]
# Bar chart of the string-typed columns.
plt.bar(df3['Year'], df3['Mortality'])
plt.title('Zimbabwe')
plt.xticks(df3['Year'], rotation=90)
plt.xlabel('Year')
plt.ylabel('Mortality')
plt.tight_layout()
plt.show()
print(df3)
The output is:
Edit: What I would like to achieve (made from excel):
Questions:
How can I make my x-axis or y-axis start from zero?
Research:
I have looked into the pandas API, and I could not find a solution for my problem there.
Your problem is that your dataframe columns Year and Mortality are strings, and matplotlib doesn't work so well with that, at least for what you're trying to achieve. So, convert them to int before plotting:
# Build the frame and cast both string columns to integers so matplotlib
# treats them as numbers.
df = pd.DataFrame(dataset).astype({'Year': int, 'Mortality': int})

# Bar chart with one labelled tick per year.
plt.bar(df['Year'], df['Mortality'], tick_label=df['Year'])
plt.xticks(df['Year'], rotation=90)

# Titles and axis labels.
plt.title('Zimbabwe')
plt.xlabel('Year')
plt.ylabel('Mortality')

plt.tight_layout()
plt.show()
With your given data, the y axis is automatically set to start at 0. However, you can change its limits to whatever you want with the plt.ylim function. For instance, plt.ylim(500, 50000) would set the y axis to start at 500 and end at 50000.

plotting data for different days on a single HH:MM:SS axis

The DataFrame has timestamped data and I want to visually compare the daily temporal evolution of the data. If I group by day and plot the graphs, they are obviously displaced horizontally in time due to differences in their dates.
I want to plot a date agnostic graph of the day wise trends on a time only axis. Towards that end I have resorted to shifting the data back by an appropriate number of days as demonstrated in the following code
import pandas as pd
import datetime
import matplotlib.pyplot as plt
# Three stretches of timestamps on different days, with different spacing.
index1 = pd.date_range('20141201', freq='H', periods=2)
index2 = pd.date_range('20141210', freq='2H', periods=4)
index3 = pd.date_range('20141220', freq='3H', periods=5)
index = index1.append([index2, index3])
df = pd.DataFrame(list(range(1, len(index)+1)), index=index, columns=['a'])
# Group the rows by day of month.
gbyday = df.groupby(df.index.day)
first_day = gbyday.keys.min() # convert all data to this day
plt.figure()
ax = plt.gca()
# Shift each day's group back in time and draw it on the shared axes.
# The loop body must be indented under the for statement to be valid Python.
for n,g in gbyday:
    g.shift(-(n-first_day+1), 'D').plot(ax=ax, style='o-', label=str(n))
plt.show()
resulting in the following plot
Question: Is this the pandas way of doing it? In other words how can I achieve this more elegantly?
You can select the hour attribute of the index after grouping like this:
In [36]: fig, ax = plt.subplots()
In [35]: for label, s in gbyday:
....: ax.plot(s.index.hour, s, 'o-', label=label)
It might be a little too late for this answer, but in case anyone is still looking for it.
This solution works on different months (it was an issue if using the code from the original question) and keeps fractional hours.
import pandas as pd
import matplotlib.pyplot as plt
# Four stretches of timestamps, including two in different months.
index0 = pd.date_range('20141101', freq='H', periods=2)
index1 = pd.date_range('20141201', freq='H', periods=2)
index2 = pd.date_range('20141210', freq='2H', periods=4)
index3 = pd.date_range('20141220', freq='3H', periods=5)
index = index1.append([index2, index3, index0])
df = pd.DataFrame(list(range(1, len(index)+1)), index=index, columns=['a'])
# Time since midnight as a fractional number of hours.
df['time_hours'] = (df.index - df.index.normalize()) / pd.Timedelta(hours=1)
fig, ax = plt.subplots()
# One line per calendar date (normalize() drops the time of day).
# The loop body must be indented under the for statement to be valid Python.
for n,g in df.groupby(df.index.normalize()):
    ax.plot(g['time_hours'], g['a'], label=n, marker='o')
ax.legend(loc='best')
plt.show()

Categories

Resources