I have a data-set of sleep-time information and would like to produce a visualisation of it using Python.
The .csv data-set I have looks like the this:
SleepStartDate,SleepStartTime,SleepStopTime
17/03/2017,23:45,07:25
19/03/2017,01:05,09:10
19/03/2017,23:50,08:25
The visualisation I want to produce should be similar to the following:
Image source: http://quantifiedself.com/wp-content/uploads/2015/08/qs2.png
I know this is a really simple visualisation, and imagine that it's built into some already existing library, but my best Googling efforts have been unable to locate it. I'd much appreciate it if someone could point me in the right direction.
Thanks in advance for your time and wisdom.
Wasn't able to find a library with functionality to do what I wanted, so ended up writing a script to do it for myself:
Script:
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import matplotlib.patches as patches
import datetime as dt
import csv
import sys
MINUTES_IN_DAY = 1440.0
COLUMN_COLOUR = 'b'
# Graph data using matplotlib visualization
def plotData(data,columnColour,maxDate,minDate):
# Set up an invisible background scatterplot give graph the correct size
# Make a series of events that are one day apart
x = mpl.dates.drange(minDate,maxDate,dt.timedelta(days=1))
# Offset first event to top of graph to give correct height
x[0] += 0.85
# Extract the time using a modulo 1, and adding an arbitrary base date
# int used so that y-axis starts at midnight
times = x % 1 + int(x[0])
fig = plt.figure()
fig.suptitle('Daily Sleep Patterns', fontsize=14, fontweight='bold')
ax = fig.add_subplot(111)
# Set background scatterplot to invisible
ax.plot_date(x, times, 'ro', color='w', visible=False)
ax.yaxis_date()
fig.autofmt_xdate()
start, end = ax.get_ylim()
# Fix division sizes and labels to show hours on y-axis
hourDivision = 1.0 / 24.0
ax.yaxis.set_ticks(np.arange(start,end,hourDivision))
ax.set_yticklabels(['Midnight','1am','2am','3am','4am','5am','6am','7am','8am','9am','10am','11am','Midday','1pm','2pm','3pm','4pm','5pm','6pm','7pm','8pm','9pm','10pm','11pm','Midnight'])
# Iterate through data
for i in range(0,len(data)):
# If period starts and finishes on different days, slit and add to both days
if data[i].startTime > data[i].stopTime:
currentDataItem = data[i]
currentDate = dt.datetime(currentDataItem.year,currentDataItem.month,currentDataItem.day)
currentDate -= dt.timedelta(days=0.5)
tomorrow = currentDate + dt.timedelta(days=1)
plt.axvspan(xmin=currentDate, xmax=tomorrow, ymin=currentDataItem.startTime, ymax=1, facecolor=columnColour, alpha=0.5)
theDayAfterTomorrow = tomorrow + dt.timedelta(days=1)
plt.axvspan(xmin=tomorrow, xmax=theDayAfterTomorrow, ymin=0, ymax=currentDataItem.stopTime, facecolor=columnColour, alpha=0.5)
# Else, add to given day
else:
currentDataItem = data[i]
currentDate = dt.datetime(currentDataItem.year,currentDataItem.month,currentDataItem.day)
currentDate -= dt.timedelta(days=0.5)
tomorrow = currentDate + dt.timedelta(days=1)
plt.axvspan(xmin=currentDate, xmax=tomorrow, ymin=currentDataItem.startTime, ymax=currentDataItem.stopTime, facecolor=columnColour, alpha=0.5)
ax.set_ylabel('Hours',fontweight='bold')
#ax.legend()
ax.grid(True)
plt.show()
# Read data from csv file
def readDataFromFile(dataFile):
f = open(dataFile,'rt')
listOfInputLists = []
try:
reader = csv.reader(f)
for row in reader:
listOfInputLists.append(row)
finally:
f.close()
return listOfInputLists
# Class to store time and date data read from file
class sleepInstance(object):
def __init__(self,listOfInputLists):
self.day = 0
self.month = 0
self.year = 0
self.formatDate(listOfInputLists[0])
self.startTime = self.formatTime(listOfInputLists[1])
self.stopTime = self.formatTime(listOfInputLists[2])
# Extracts date information variables
def formatDate(self,unformattedDate):
date = dt.datetime.strptime(unformattedDate,"%d/%m/%y")
self.day = int(date.strftime("%d"))
self.month = int(date.strftime("%m"))
self.year = int(date.strftime("%Y"))
# Formats time as a decimal fraction of day, for use in graph
def formatTime(self,unformattedTime):
timeSinceMidnight = dt.datetime.strptime(unformattedTime,'%H:%M:%S')
midnight = dt.datetime(1900,1,1)
minutesSinceMidnight = ((timeSinceMidnight - midnight).total_seconds() / 60.0)
fractionOfDay = minutesSinceMidnight / MINUTES_IN_DAY
return fractionOfDay
# Formats data read from file as a list of sleepInstance objects
def formatDataForPlot(listOfInputLists):
sleeps = []
for i in range(1,len(listOfInputLists)):
sleeps.append(sleepInstance(listOfInputLists[i]))
return sleeps
# Extracts earliest (min) and latest (max) dates from data, for use in setting graph limits
def getMaxAndMinDates(plotDataList):
dateTimeList = []
for item in plotDataList:
nextDate = dt.datetime(item.year,item.month,item.day)
dateTimeList.append(nextDate)
maxDate = max(dateTimeList)
minDate = min(dateTimeList)
return maxDate, minDate
dataFile = 'sleepData.csv'
listOfInputLists = readDataFromFile(dataFile)
plotDataList = formatDataForPlot(listOfInputLists)
maxDate, minDate = getMaxAndMinDates(plotDataList)
plotData(plotDataList,COLUMN_COLOUR,maxDate,minDate)
Input:
Date,Start,Finish
17/03/17,03:15:00,03:55:00
17/03/17,06:20:00,06:35:00
17/03/17,09:00:00,09:40:00
17/03/17,13:10:00,13:35:00
17/03/17,15:45:00,16:30:00
17/03/17,18:45:00,19:25:00
17/03/17,21:15:00,21:35:00
18/03/17,00:30:00,02:00:00
18/03/17,04:50:00,05:05:00
18/03/17,08:20:00,08:40:00
18/03/17,12:30:00,13:10:00
18/03/17,16:30:00,17:00:00
18/03/17,18:45:00,19:00:00
18/03/17,20:30:00,21:00:00
19/03/17,00:00:00,12:00:00
19/03/17,18:00:00,23:59:00
19/03/17,13:00:00,14:00:00
20/03/17,12:00:00,11:00:00
Output:
May fancy it up a bit when I have time: https://github.com/ambidextrous/timeLogGrapher
The classic Python choice would be to use the matplotlib package. Looking at your sample graph it looks like a vertical bar graph.
Related
From the code given here, I have developed another code which uses Matplotlib in place of Seaborn (The data are plotted on several figures and subplots, and so are now more readable and I am closer to the point I want to reach: the user by putting the cursor over a point has access to all the information of the point, in particular the datetime.)
Here it is:
import pandas as pd
import numpy as np
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import random
from datetime import datetime
# size of the database
n = 1000
nA = 4
nB = 9
no = np.arange(n)
date = np.random.randint(1e9, size=n).astype('datetime64[s]')
A = [''.join(['A',str(random.randint(1, nA))]) for j in range(n)]
B = [''.join(['B',str(random.randint(1, nB))]) for j in range(n)]
Epsilon1 = np.random.random_sample((n,))
Epsilon2 = np.random.random_sample((n,))
Epsilon3 = np.random.random_sample((n,))
data = pd.DataFrame({'no':no,
'Date':date,
'A':A,
'B':B,
'Epsilon1':Epsilon1,
'Epsilon2':Epsilon2,
'Epsilon3':Epsilon3})
def format_coord(x, y):
string_x = datetime.utcfromtimestamp(x).strftime("%m/%d/%Y, %H:%M:%S")
return 'x={}, y={:.4f}'.format(string_x,y)
def plot_Epsilon_matplotlib():
for A in data['A'].sort_values().drop_duplicates().to_list():
n_col = 2
fig, axes = plt.subplots(np.ceil(nB/n_col).astype(int),n_col)
for j, B in enumerate(data['B'].sort_values().drop_duplicates().to_list()):
df = data.loc[(data['A']==A) & (data['B']==B)]
df = df.sort_values("Date", ascending=True)
axes.flatten()[j].plot(df["Date"],df['Epsilon1'],marker='x',c='b',label="Epsilon1")
axes.flatten()[j].plot(df["Date"],df['Epsilon2'],marker='x',c='r',label="Epsilon2")
axes.flatten()[j].plot(df["Date"],df['Epsilon3'],marker='x',c='g',label="Epsilon3")
axes.flatten()[j].format_coord = format_coord
if __name__ == '__main__':
plot_Epsilon_matplotlib()
The goal is that when the user puts the cursor over a point, he gets access to the full datetime of the data.
I have first tried to change the major formatter (as here):
axes.flatten()[j].xaxis.set_major_formatter(mdates.DateFormatter('%Y/%m/%d %H:%M:%S'))
but then the x ticks are not readable (especially if the user zooms on a subplot)
I then tried the define my own format_coord as here. My first try is given in the full code given above. The format of the datetime in Matplotlib figure status bar is good but the date remains in 1970 !
After reading this discussion, I realized this problem relates on Numpy datetime64 to Datetime conversion. I then coded this new version of format_coord (strongly inspired from this answer):
def format_coord_bis(x,y):
dt64 = np.datetime64(datetime.utcfromtimestamp(x))
unix_epoch = np.datetime64(0, 's')
one_second = np.timedelta64(1, 's')
seconds_since_epoch = (dt64 - unix_epoch) / one_second
string_x = datetime.utcfromtimestamp(seconds_since_epoch).strftime("%m/%d/%Y, %H:%M:%S")
return 'x={}, y={:.4f}'.format(string_x,y)
but the date given in the status bar remains the 01/01/1970...
I have found the solution from this answer.
The function format_coord() should be defined as follows:
def format_coord(x, y):
string_x = matplotlib.dates.num2date(x).strftime('%Y-%m-%d %H:%M:%S')
return 'x={}, y={:.4f}'.format(string_x,y)
I have a function that reads csv and outputs me 12 graphs. But it displays the time with a very small interval.
Here's the function!
def Gr():
df = pd.read_csv('DataSet.csv',)
'''start = df['Time'].iloc[0]
start = str(start)
start1 = start.replace(':', '-')
end = df['Time'].iloc[-1]
end = str(end)
end1 = end.replace(':', '-')
index = pd.date_range(start = start1, end = end1, freq = "S")
index = [pd.to_datetime(date, format='%H:%M:%S').date() for date in index] '''
names = ['P', 'Filter', 'Answers', 'step','step2','Comulative', 'Delta_ema','ComulativePOC', 'Delta_P', 'Sum','SpeedUp', 'M' ]
features = df[names]
features.index = df['Time']
axs = features.plot(subplots=True)
cursor = MultiCursor(axs[1].get_figure().canvas, axs)
plt.subplots_adjust(wspace=0.19, hspace=0.05, top=0.99, right=0.988, bottom=0.052, left=0.055)
plt.show()
Here is a screenshot of the result of the functions. I circled the time at the bottom. I would like to increase the interval to at least once every 5 seconds or even 1 second.
Is it possible to do it this way ? Without a figure?
you should uses something like this before your plt.show() line
import matplotlib.dates as m_dates
ax = plt.gca() # get the current axis
ax.xaxis.set_major_locator(m_dates.SecondLocator(interval=5)) # every five seconds
info about SecondLocator:
https://matplotlib.org/stable/api/dates_api.html#matplotlib.dates.SecondLocator
info dates in general:https://matplotlib.org/stable/api/dates_api.html
info ticks locating: https://matplotlib.org/stable/api/ticker_api.html?highlight=ticks%20locator
i'm trying to plot a simple bar chart of the average 'resale_price' (y-axis) for each flat type against 'town' (x-axis) for data from 2015-2019. However for some reason, my x-axis keeps changing every time I re-run my code. Not sure where I've gone wrong.
dataset: https://data.gov.sg/dataset/resale-flat-prices
here's the code i've used below
labels1 = list(set(data_3room['town']))
town1 = np.arange(0,len(labels1))
town1_values = data_3room[['town','resale_price']]
values1 = town1_values['resale_price']/1000
# print(values1)
avg_values1 = {}
for i in labels1:
valuesfortown1 = values1[town1_values['town']==i]
avg1 = np.average(valuesfortown1)
print("Average 3 Room Resale Price for town " + i + " is {:.0f}".format(avg1))
avg_values1[i] = avg1
from collections import OrderedDict
from operator import itemgetter
avg_values1 = OrderedDict(sorted(avg_values1.items(), key = itemgetter(1), reverse = True))
plt.figure(1, figsize=(30,30))
barchart1 = plt.bar(list(avg_values1.keys()), list(avg_values1.values()), color='#d62728')
for i in range(len(barchart1)):
bar1 = barchart1[i]
x1,y1 = bar1.get_xy()
h1 = bar1.get_height()
plt.text(x1,h1,"{:.0f}".format(list(avg_values1.values())[i]),fontsize=30)
plt.title('3 Room Resale Prices by Town',fontsize=40)
plt.ylabel('Resale Prices (Thousands)',fontsize=40)
plt.yticks(fontsize=20)
plt.xticks(town1, labels1, fontsize=40,rotation='vertical')
I guess this is supposed to be simple.. But I cant seem to make it work.
I have some stock data
import pandas as pd
import numpy as np
df = pd.DataFrame(index=pd.date_range(start = "06/01/2018", end = "08/01/2018"),
data = np.random.rand(62)*100)
I am doing some analysis on it, this results of my drawing some lines on the graph.
And I want to plot a 45 line somewhere on the graph as a reference for lines I drew on the graph.
What I have tried is
x = df.tail(len(df)/20).index
x = x.reset_index()
x_first_val = df.loc[x.loc[0].date].adj_close
In order to get some point and then use slope = 1 and calculate y values.. but this sounds all wrong.
Any ideas?
Here is a possibility:
import pandas as pd
import numpy as np
df = pd.DataFrame(index=pd.date_range(start = "06/01/2018", end = "08/01/2018"),
data=np.random.rand(62)*100,
columns=['data'])
# Get values for the time:
index_range = df.index[('2018-06-18' < df.index) & (df.index < '2018-07-21')]
# get the timestamps in nanoseconds (since epoch)
timestamps_ns = index_range.astype(np.int64)
# convert it to a relative number of days (for example, could be seconds)
time_day = (timestamps_ns - timestamps_ns[0]) / 1e9 / 60 / 60 / 24
# Define y-data for a line:
slope = 3 # unit: "something" per day
something = time_day * slope
trendline = pd.Series(something, index=index_range)
# Graph:
df.plot(label='data', alpha=0.8)
trendline.plot(label='some trend')
plt.legend(); plt.ylabel('something');
which gives:
edit - first answer, using dayofyear instead of the timestamps:
import pandas as pd
import numpy as np
df = pd.DataFrame(index=pd.date_range(start = "06/01/2018", end = "08/01/2018"),
data=np.random.rand(62)*100,
columns=['data'])
# Define data for a line:
slope = 3 # unit: "something" per day
index_range = df.index[('2018-06-18' < df.index) & (df.index < '2018-07-21')]
dayofyear = index_range.dayofyear # it will not work around the new year...
dayofyear = dayofyear - dayofyear[0]
something = dayofyear * slope
trendline = pd.Series(something, index=index_range)
# Graph:
df.plot(label='data', alpha=0.8)
trendline.plot(label='some trend')
plt.legend(); plt.ylabel('something');
EDIT: I figured out that the Problem always occours if one tries to plot to two different lists of figures. Does that mean that one can not do plots to different figure-lists in the same loop? See latest code for much simpler sample of a problem.
I try to analyze a complex set of data which consists basically about measurements of electric devices under different conditions. Hence, the code is a bit more complex but I tried to strip it down to a working example - however it is still pretty long. Hence, let me explain what you see: You see 3 classes with Transistor representing an electronic device. It's attribute Y represents the measurement data - consisting of 2 sets of measurements. Each Transistor belongs to a group - 2 in this example. And some groups belong to the same series - one series where both groups are included in this example.
The aim is now to plot all measurement data for each Transistor (not shown), then to also plot all data belonging to the same group in one plot each and all data of the same series to one plot. In order to program it in an efficent way without having a lot of loops my idea was to use the object orientated nature of matplotlib - I will have figures and subplots for each level of plotting (initialized in initGrpPlt and initSeriesPlt) which are then filled with only one loop over all Transistors (in MainPlt: toGPlt and toSPlt). In the end it should only be printed / saved to a file / whatever (PltGrp and PltSeries).
The Problem: Even though I specify where to plot, python plots the series plots into the group plots. You can check this yourself by running the code with the line 'toSPlt(trans,j)' and without. I have no clue why python does this because in the function toSPlt I explicetly say that python should use the subplots from the series-subplot-list. Would anyone have an idea to why this is like this and how to solve this problem in an elegent way?
Read the code from the bottom to the top, that should help with understanding.
Kind regards
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
maxNrVdrain = 2
X = np.linspace(-np.pi, np.pi, 256,endpoint=True)
A = [[1*np.cos(X),2*np.cos(X),3*np.cos(X),4*np.cos(X)],[1*np.tan(X),2*np.tan(X),3*np.tan(X),4*np.tan(X)]]
B = [[2* np.sin(X),4* np.sin(X),6* np.sin(X),8* np.sin(X)],[2*np.cos(X),4*np.cos(X),6*np.cos(X),8*np.cos(X)]]
class Transistor(object):
_TransRegistry = []
def __init__(self,y1,y2):
self._TransRegistry.append(self)
self.X = X
self.Y = [y1,y2]
self.group = ''
class Groups():
_GroupRegistry = []
def __init__(self,trans):
self._GroupRegistry.append(self)
self.transistors = [trans]
self.figlist = []
self.axlist = []
class Series():
_SeriesRegistry = []
def __init__(self,group):
self._SeriesRegistry.append(self)
self.groups = [group]
self.figlist = []
self.axlist = []
def initGrpPlt():
for group in Groups._GroupRegistry:
for j in range(maxNrVdrain):
group.figlist.append(plt.figure(j))
group.axlist.append(group.figlist[j].add_subplot(111))
return
def initSeriesPlt():
for series in Series._SeriesRegistry:
for j in range(maxNrVdrain):
series.figlist.append(plt.figure(j))
series.axlist.append(series.figlist[j].add_subplot(111))
return
def toGPlt(trans,j):
colour = cm.rainbow(np.linspace(0, 1, 4))
group = trans.group
group.axlist[j].plot(trans.X,trans.Y[j], color=colour[group.transistors.index(trans)], linewidth=1.5, linestyle="-")
return
def toSPlt(trans,j):
colour = cm.rainbow(np.linspace(0, 1, 2))
series = Series._SeriesRegistry[0]
group = trans.group
if group.transistors.index(trans) == 0:
series.axlist[j].plot(trans.X,trans.Y[j],color=colour[series.groups.index(group)], linewidth=1.5, linestyle="-", label = 'T = nan, RH = nan' )
else:
series.axlist[j].plot(trans.X,trans.Y[j],color=colour[series.groups.index(group)], linewidth=1.5, linestyle="-")
return
def PltGrp(group,j):
ax = group.axlist[j]
ax.set_title('Test Grp')
return
def PltSeries(series,j):
ax = series.axlist[j]
ax.legend(loc='upper right', frameon=False)
ax.set_title('Test Series')
return
def MainPlt():
initGrpPlt()
initSeriesPlt()
for trans in Transistor._TransRegistry:
for j in range(maxNrVdrain):
toGPlt(trans,j)
toSPlt(trans,j)#plots to group plot for some reason
for j in range(maxNrVdrain):
for group in Groups._GroupRegistry:
PltGrp(group,j)
plt.show()
return
def Init():
for j in range(4):
trans = Transistor(A[0][j],A[1][j])
if j == 0:
Groups(trans)
else:
Groups._GroupRegistry[0].transistors.append(trans)
trans.group = Groups._GroupRegistry[0]
Series(Groups._GroupRegistry[0])
for j in range(4):
trans = Transistor(B[0][j],B[1][j])
if j == 0:
Groups(trans)
else:
Groups._GroupRegistry[1].transistors.append(trans)
trans.group = Groups._GroupRegistry[1]
Series._SeriesRegistry[0].groups.append(Groups._GroupRegistry[1])
return
def main():
Init()
MainPlt()
return
main()
latest example that does not work:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
X = np.linspace(-np.pi, np.pi, 256,endpoint=True)
Y1 = np.cos(X)
Y2 = np.sin(X)
figlist1 = []
figlist2 = []
axlist1 = []
axlist2 = []
for j in range(4):
figlist1.append(plt.figure(j))
axlist1.append(figlist1[j].add_subplot(111))
figlist2.append(plt.figure(j))#this should be a new set of figures!
axlist2.append(figlist2[j].add_subplot(111))
colour = cm.rainbow(np.linspace(0, 1, 4))
axlist1[j].plot(X,j*Y1, color=colour[j], linewidth=1.5, linestyle="-")
axlist1[j].set_title('Test Grp 1')
colour = cm.rainbow(np.linspace(0, 1, 4))
axlist2[j].plot(X,j*Y2, color=colour[int(j/2)], linewidth=1.5, linestyle="-")
axlist2[j].set_title('Test Grp 2')
plt.show()
Ok, stupid mistake if one thinks of the Background but maybe someone has a similar Problem and is unable to see the cause as I was first. So here is the solution:
The Problem is that the Name of the listobjects like figlist1[j] do not define the figure - they are just pointers to the actual figure object. and if such an object is created by plt.figure(j) one has to make sure that j is different for each figure - hence, in a Loop where multiple figures shall be initialized one Needs to somehow Change the number of the figure or the first object will be overwritten. Hope that helps! Cheers.