Plotting Date vs time from files - python

I have folders whose names are dates, formatted as
2018-08-21 to 2018-10-16
Inside each folder there are compressed (.bz2) files containing time values on a linear scale, i.e. the time increases linearly.
For each day (each of which has many .bz2 files of time-series data) I'm trying to plot the time values recorded on that date.
Right now I'm trying to do it like this:
# Time values taken from data1 (epoch values, per the description) as an array.
timearr = np.asarray(data1['time'])
# plot() needs x and y of equal length. NOTE(review): presumably the x data
# holds one entry per day folder while timearr holds one entry per sample,
# which would explain the reported dimension mismatch -- confirm.
ax.plot(np.asarray(timeStamps), timearr)
ax.set_title('title')
ax.set_ylabel('date vs time ')
ax.grid(True)
# Format the x-axis for dates (label formatting, rotation)
fig.autofmt_xdate(rotation=45)
fig.tight_layout()
plt.show()
but I get an error message saying that the dimensions of the two arrays don't match.
dateStamps is a list of dates:
[2018-08-21,
2018-08-22,
2018-08-23,
2018-08-24,
2018-08-25]
data1['time'] is a list of epoch values.

Unfortunately, I don't know the detailed file structure, so I have to guess a little where the problem actually is.
Here is some code to generate some folders with generic bz2 files:
import bz2
import numpy as np
import datetime
import os
# Create five consecutive day folders (named YYYY-MM-DD), each holding one
# bz2-compressed text file 'dat.bz2' with 100 "HH:MM:SS  value" rows.
startDate = datetime.datetime(2000, 5, 2, 10, 15, 0, 0)
for day in range(5):
    theDate = startDate + datetime.timedelta(days=day)
    folder = theDate.strftime("%Y-%m-%d")
    # os.mkdir raises if the folder already exists; tolerate re-running.
    if not os.path.isdir(folder):
        os.mkdir(folder)
    rows = []
    for k in range(100):
        # One sample every 137 seconds, starting at the day's start time.
        zzz = theDate + datetime.timedelta(seconds=137 * k)
        d = zzz.day
        m = zzz.minute
        value = .17 * d + .003 * m**2 - .001 * m
        rows.append("{}  {}\n".format(zzz.strftime("%H:%M:%S"), value))
    # Join once instead of growing a string with += inside the loop.
    data = "".join(rows)
    # BZ2File.write() needs bytes on Python 3, so encode the text explicitly;
    # the with-block closes the file even if the write fails.
    with bz2.BZ2File(os.path.join(folder, 'dat.bz2'), 'w') as myZip:
        myZip.write(data.encode('ascii'))
I process those folders and files with:
import bz2
import numpy as np
import datetime
import os
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
"""
SE posts I used
https://stackoverflow.com/questions/1574088/plotting-time-in-python-with-matplotlib
https://stackoverflow.com/questions/11264521/date-ticks-and-rotation-in-matplotlib
"""
def split_data( inData ):
    """Parse whitespace-separated ``HH:MM:SS value`` rows into two lists.

    inData -- the file content, one row per line, either as text or as the
              raw bytes returned by ``BZ2File.read()``.

    Returns ``(x, y)``: ``x`` is a list of ``datetime.datetime`` objects
    parsed with the format ``'%H:%M:%S'`` and ``y`` is the matching list of
    floats. Both lists are empty when the input contains no rows.
    """
    # BZ2File.read() yields bytes on Python 3; decode so that the string
    # splitting and strptime below operate on text.
    if isinstance( inData, bytes ):
        inData = inData.decode( 'ascii' )
    # Parse the argument itself (the original read the module-level variable
    # ``data`` here, so the parameter was silently ignored).
    rowcol = [ row.split() for row in inData.strip().split( '\n' ) if row.strip() ]
    if not rowcol:
        # zip(*[]) cannot be unpacked into two names; return empty series.
        return [], []
    x, y = zip( *rowcol )
    y = [ float( z ) for z in y ]
    x = [ datetime.datetime.strptime( z, '%H:%M:%S' ) for z in x ]
    return x, y
# Collect the parsed series of every .bz2 file found below the current
# directory, keyed by the name of the folder (the date) that contains it.
# A folder with several .bz2 files keeps only the last one read.
dataDict = dict()
for root, dirs, files in os.walk("."):
    for name in files:
        if name.split('.')[-1] == 'bz2':
            base = os.path.basename( root )
            myPath = os.path.join( root, name )
            # The with-block closes the compressed file after reading.
            with bz2.BZ2File( myPath, 'r' ) as bz:
                data = bz.read()
            dataDict[ base ] = split_data( data )

myFmt = mdates.DateFormatter('%H:%M')
fig = plt.figure()
ax = fig.add_subplot( 1, 1, 1 )
# Iterating the sorted keys works on Python 2 and 3 (dict.iteritems() is
# Python 2 only) and puts the legend entries in chronological order.
for key in sorted( dataDict ):
    ax.plot( *dataDict[ key ], label=key )
ax.xaxis.set_major_formatter( myFmt )
for label in ax.get_xmajorticklabels():
    label.set_rotation( 30 )
ax.set_ylabel('data (arb. u.)')
ax.set_xlabel('time')
ax.legend( loc=0 )
plt.tight_layout()
plt.show()
Providing:
Hope I got it right.

Related

Image saving with plt needs more and more time

I wrote a simple program to convert .wav files to spectrograms and save each one as a PNG.
Here you go:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io.wavfile as wavfile
import os
import time as t
# Root folder of the dataset; the loop below reads one sub-folder per category.
DATAPATH = 'dataset' #path
CATEGORIES = ['zero','one','two','three','four','five','six','seven','eight','nine']
for categorie in CATEGORIES:
path = DATAPATH + '/' + categorie + '/'
filenames = os.listdir(path) #get all filenames in categorie
print(categorie)
# Per-category running index, used as the output file name.
i = 0
# Only the first 100 files of each category are converted.
for file in filenames[:100]:
start = t.time()
Fs, aud = wavfile.read(path + file)
# NOTE(review): nothing in this loop clears or closes the figure, so each
# specgram is presumably added to the same current axes -- confirm; that would
# match the growing per-image time reported below.
powerSpectrum, frequenciesFound, time, imageAxis = plt.specgram(aud, Fs=Fs)
plt.subplots_adjust(left=0, right=1, bottom=0, top=1) #cut axis
plt.axis('off')
plt.savefig('pics/' + categorie + '/' + str(i) + '.png')
ende = t.time()
# Print the index and the elapsed wall-clock seconds for this file.
print(i, str(ende-start)+'s')
i += 1
The problem is that the time per image keeps getting higher (only by a few milliseconds each time), but by the thousandth picture it is around 10 s per picture. That is why I measure the time and print it out. Are there any solutions?
For the record, the solution seems to be clearing the figure after every iteration using plt.clf():
for categorie in CATEGORIES:
# ...
for file in filenames[:100]:
# ...
# plt.savefig(...)
# Clear the current figure right after saving, inside the per-file loop, so
# the next spectrogram starts from an empty figure.
plt.clf()
# ...

plotting with drawnow in python

I am trying to monitor sensor data in real time, but the plot shows nothing. Below is just an example.
Can anyone explain why the result shows nothing?
import datetime
import random
import matplotlib.pyplot as plt
from drawnow import *
from matplotlib.dates import AutoDateLocator, AutoDateFormatter, date2num
# Module-level state shared between the update loop and CreatePlot():
# i is the iteration counter, x/y the current sample, FirstTime/LastTime the
# 'HH:MM' strings used for the x-axis limits.
i = 0
x = 0
y = 0
FirstTime = str('00:00')
LastTime = str('00:00')
# Drawing callback handed to drawnow() in the loop below; it reads the
# module-level x, y, FirstTime and LastTime.
def CreatePlot():
figure = plt.subplot()
plt.plot([],[])
date_datetime = datetime.datetime.strptime(LastTime, '%H:%M')
# int_date is computed but not used anywhere in this listing.
int_date = date2num( date_datetime)
locator = AutoDateLocator()
figure.xaxis.set_major_locator(locator)
figure.xaxis.set_major_formatter( AutoDateFormatter(locator) )
min_date = date2num( datetime.datetime.strptime(FirstTime, '%H:%M') )
max_date = date2num( datetime.datetime.strptime(LastTime, '%H:%M') )
plt.xlim(min_date, max_date)
# NOTE(review): x is an 'HH:MM' string and y a single number at this point, so
# one point is passed with the line-only style 'r-' -- presumably why nothing
# is visible; confirm against the accepted explanation below.
plt.plot(x,y, 'r-')
plt.gcf().autofmt_xdate()
while True:
# Synthetic timestamp: now plus i minutes, then converted to an 'HH:MM' string.
x = datetime.datetime.now() + datetime.timedelta(minutes=i)
x = datetime.datetime.strftime(x,'%H:%M')
if i == 0:
FirstTime = x
else:
LastTime = x
y = (2*i)+2
# Stop after 501 samples (i runs from 0 to 500).
if i>500:
break
else:
drawnow(CreatePlot)
plt.pause(0.0001)
i+=1
I solved the issue, so I am going to explain it to help anyone else in the same situation.
The first issue is converting the date to a string with strftime:
strings plotted on the x-axis cannot be auto-formatted as dates.
Also, the following commands are redundant:
min_date = date2num( datetime.datetime.strptime(FirstTime, '%H:%M') )
max_date = date2num( datetime.datetime.strptime(LastTime, '%H:%M') )
plt.xlim(min_date, max_date)
In addition, for a better view one can add the following commands too:
from matplotlib.ticker import AutoMinorLocator
from matplotlib.dates import AutoDateLocator, AutoDateFormatter
.
.
.
.
# First cell of a 2x2 subplot grid.
ax0 = plt.subplot(2,2,1)
# Major ticks: automatic date locator, with a formatter bound to that locator.
locator = AutoDateLocator()
ax0.xaxis.set_major_locator(locator)
formatter = AutoDateFormatter(locator)
ax0.xaxis.set_major_formatter(formatter)
# Minor ticks: automatic minor locator on the same axis.
ax0.xaxis.set_minor_locator(AutoMinorLocator())

Cutting .wav file into segments with the same length

I want to cut a .wav file into multiple segments of the same length.
I found this code: https://gist.github.com/kylemcdonald/c8e62ef8cb9515d64df4
But it splits the file into parts based on onset detection with librosa. I assume that the answer to my question is simple, but I would appreciate any help.
That's the code I used with Python 3.7.6 on Ubuntu (in conda):
import matplotlib
import matplotlib.pyplot as plt # For displaying the output
import librosa
import numpy as np # For some mathematical operations
from glob import glob # To grab files
import os
# Output directory for the result tables written at the end of the script
save_dir = './cut_4s'
### Load the audio_file
data_dir = './' # Input path; './' is the current working directory
audio_files = glob(data_dir + '/*.wav') # Grab audio files (.wav) in the data_dir
found = len(audio_files)
print("Audiofiles found: " + str(found))
input("Press Enter to continue...")
# Only the first file found is processed; an empty list raises IndexError here.
y, sr = librosa.load(audio_files[0])
length = librosa.get_duration(y=y, sr=sr) # Get the length of the file
time = np.arange(0, len(y)) / sr # Create the time array (timeline)
print(str(length))
# Plot audio over time
fig, ax = plt.subplots()
ax.plot(time, y)
ax.set(xlabel='Time (s)', ylabel='Sound Amplitude')
plt.show()
# Onset detection: magnitude of the constant-Q transform, converted to dB, is
# used as the spectrogram input of the onset-strength envelope.
C = np.abs(librosa.cqt(y=y, sr=sr))
o_env = librosa.onset.onset_strength(sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
#o_env = librosa.onset.onset_strength(y, sr=sr, feature=librosa.cqt)
# Frame indices of the detected onsets; converted to sample positions later.
onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)
def prepare(y, sr=22050):
    """Return *y* converted to mono, fixed to ``sr`` samples and normalized.

    y  -- audio samples as accepted by ``librosa.to_mono``.
    sr -- sample rate; also used as the target length in samples, so the
          result spans one second of audio at that rate.
    """
    y = librosa.to_mono(y)
    # Pass the length by keyword: ``size`` is keyword-only in current librosa
    # releases, and the keyword form is accepted by older ones as well.
    y = librosa.util.fix_length(y, size=sr) # 1 second of audio
    y = librosa.util.normalize(y)
    return y
def get_fingerprint(y, sr=22050):
    """Return the flattened constant-Q transform of the prepared signal.

    y  -- audio samples; passed through ``prepare`` first.
    sr -- sample rate forwarded to ``prepare`` and ``librosa.cqt``.

    The CQT is computed with ``hop_length=2048`` and flattened in
    column-major ('F') order into a 1-D vector.
    """
    y = prepare(y, sr)
    # Keyword ``y=`` matches the other cqt call in this script and is
    # required by librosa releases where the signal is keyword-only.
    cqt = librosa.cqt(y=y, sr=sr, hop_length=2048)
    return cqt.flatten('F')
def normalize(x):
    """Shift and scale *x* in place along axis 0 and return it.

    The per-column minimum is subtracted first, then the result is divided
    by its per-column maximum. The argument itself is modified.
    """
    column_floor = x.min(axis=0)
    x -= column_floor
    column_peak = x.max(axis=0)
    x /= column_peak
    return x
def basename(file):
    """Return the final path component of *file* without its last extension."""
    stem, _extension = os.path.splitext(os.path.basename(file))
    return stem
vectors = []
words = []
filenames = []
# Segment boundaries: every detected onset plus the end of the signal, so the
# last segment runs up to the final sample.
onset_samples = librosa.frames_to_samples(onset_frames)
# np.concatenate takes ONE sequence of arrays; the original passed len(y) as
# the second positional argument (the axis) instead of appending it.
onset_samples = np.concatenate([onset_samples, [len(y)]])
starts = onset_samples[0:-1]
stops = onset_samples[1:]
samples_folder = os.path.join(data_dir, 'samples')
# Create both output folders up front. exist_ok only tolerates "already
# exists"; other failures (e.g. permissions) are no longer swallowed.
# save_dir was never created before, so the np.savetxt calls below would fail
# on a fresh checkout.
for out_dir in (samples_folder, save_dir):
    os.makedirs(out_dir, exist_ok=True)
for i, (start, stop) in enumerate(zip(starts, stops)):
    audio = y[start:stop]
    filename = os.path.join(samples_folder, str(i) + '.wav')
    # NOTE(review): librosa.output was removed in librosa 0.8 -- confirm the
    # installed version still provides write_wav.
    librosa.output.write_wav(filename, audio, sr)
    vector = get_fingerprint(audio, sr=sr)
    word = basename(filename)
    vectors.append(vector)
    words.append(word)
    filenames.append(filename)
# One row per segment in each of the three result tables.
np.savetxt(os.path.join(save_dir, 'vectors'), vectors, fmt='%.5f', delimiter='\t')
np.savetxt(os.path.join(save_dir, 'words'), words, fmt='%s')
np.savetxt(os.path.join(save_dir, 'filenames.txt'), filenames, fmt='%s')

This script in python 2.x worked and made multiple plots in one run. Now, in python 3.x, it only makes one plot

I wrote this about a year ago in Python 2.x. I have since switched over to 3.x, and I am not getting any errors, but as mentioned above, only one plot is being output: the first one, the stdev plot. If I use #%% in Spyder to make the code under ...##mean plot##... into its own cell and run that cell, it will make that plot. If I switch the order of the mean and stdev code, it will just make whichever one comes first. I have many similar scripts and they all have the same issue.
Is it the indentation? I've played around with it to no avail.
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 28 09:16:29 2017
#author: kmiranda
"""
import numpy as np
import matplotlib.pyplot as plt
import mpl_toolkits.basemap as bm
##from ens_function import ens_data, grd_data
from scipy.io import netcdf
# NOTE(review): the indentation of this listing has been lost, so which
# statements sit inside which loop cannot be determined from here. The
# comments below describe each statement only.
# Days 3..15; each value is formatted into the date-time group string DTG.
for date in np.arange(3,16):
# Octal literal (Python 3 syntax) with value 1: the member used to read the
# grid variables below.
mem=0o01
DTG = '201501%2.2d00' % (date)
nfdir='/u/prob/coupled/rowley/pertobs/kmiranda/scratch/gom/ensemble/post'
ncfn=nfdir + '/ncom_relo_gom_' + DTG + '.%3.3d'%mem + '.nc'
nc1 = netcdf.netcdf_file(ncfn, 'r')
lat = nc1.variables['lat'][:]
lon = nc1.variables['lon'][:]
dep = nc1.variables['depth'][:]
tau = nc1.variables['tau'][:] ## added this. it's probably not right
water_temp = nc1.variables['water_temp']
# Index 1 along the first axis of water_temp; "*1." yields a float copy.
t0 = np.squeeze(water_temp[1,:,:,:])*1. ##I changed the 0 to 1 here
# -30000 is replaced by NaN (presumably the file's fill value -- confirm).
t0[np.where(t0 == -30000)]=np.nan
# Map corners, used as [lon_min, lon_max, lat_min, lat_max] in the call below.
bbox=[262.,281.,18.,31.]
ma = bm.Basemap(projection='merc',llcrnrlat=bbox[2],urcrnrlat=bbox[3], \
llcrnrlon=bbox[0],urcrnrlon=bbox[1],lat_ts=0,resolution='i')
# Running accumulators: s1 collects the sum of squares, s2 the plain sum.
s1 = np.zeros(t0.shape)
s2 = np.zeros(t0.shape)
#del lat, lon, dep, water_temp, t0
#nc1.close()
# Members 1..32: one file is read per member.
for i in np.arange(1,33):
DTG = '201501%2.2d00' % (date)
nfdir='/u/prob/coupled/rowley/pertobs/kmiranda/scratch/gom/ensemble/post'
ncfn=nfdir + '/ncom_relo_gom_' + DTG + '.%3.3d' %i + '.nc'
nc1 = netcdf.netcdf_file(ncfn, 'r')
water_temp = nc1.variables['water_temp']
t0 = np.squeeze(water_temp[1,:,:,:])*1. ##I changed the 0 to 1 here
t0[np.where(t0 == -30000)]=np.nan
# Apply the variable's scale_factor and add_offset, masking the NaN cells.
temp = np.ma.masked_invalid(np.squeeze(t0)*water_temp.scale_factor + water_temp.add_offset)
# The result of this call is discarded.
temp.min()
# n takes the member index; since indices start at 1, its last value (32)
# equals the number of members read.
n = i
s1 = s1 + np.power(temp,2)
s2 = s2 + temp
# The result of this call is discarded.
s2.min()
# k takes the values 0 and 20; it indexes the first axis of mean/std and dep.
for k in np.arange(0,21,20):
# Standard deviation from the running sums: sqrt((n*sum(x^2) - sum(x)^2) / n^2).
std = np.sqrt((((n * s1)-(np.power(s2,2)))/(n**2)))
mean = s2/n
mean = np.squeeze(mean[k, :, :])
std = np.squeeze(std[k, :, :])
################ stdev plot ####################
x, y = np.meshgrid(lon, lat)
p1 = ma.pcolormesh(x, y, std, shading = 'flat', \
cmap = plt.cm.jet, latlon = True, vmin=0, vmax=0.8)
# This plot takes its colorbar from the Basemap object; the mean plot below
# uses plt.colorbar() instead.
c = ma.colorbar()
#degree = u'\xb0'
#c.set_label("C%s" % (degree))
#m.colorbar(p1)
ma.drawcoastlines()
ma.fillcontinents(color='#EDE0BE')
ma.drawparallels(np.arange(20., 30., 5.))
ma.drawmeridians(np.arange(270., 280., 5.))
ma.drawcoastlines(linewidth=0.25)
ma.drawcountries(linewidth=0.25)
plt.title('Std Temperature %s %dm' % (DTG, dep[k]))
fig_name =nfdir + '/temp_std_z%06.1f' % dep[k] + '_ncom_relo_gom_' + DTG + '.%3.3d' % int(tau[1]) + '.png'
plt.savefig(fig_name , dpi=200)
# NOTE(review): depending on backend and interactive mode, plt.show() may block
# until the window is closed -- possibly related to the single-plot behaviour
# described above; confirm.
plt.show()
plt.clf()
########### mean plot ###################
# Color range depends on the depth index: 10-30 for k == 0, 10-28 otherwise.
if k == 0:
vmin =10
vmax = 30
else:
vmin = 10
vmax = 28
p2 = ma.pcolormesh(x, y, mean, shading = 'flat', \
cmap=plt.cm.jet, latlon= True, vmin=vmin, vmax=vmax)
c2 = plt.colorbar()
# Degree sign for the colorbar label.
degree = u'\xb0'
c2.set_label("C%s" % (degree))
ma.drawcoastlines()
ma.fillcontinents(color='#EDE0BE')
ma.drawparallels(np.arange(20., 30., 5.))
ma.drawmeridians(np.arange(270., 280., 5.))
ma.drawcoastlines(linewidth=0.25)
ma.drawcountries(linewidth=0.25)
plt.title('Mean Temperature %s %dm' % (DTG, dep[k]))
fig_name =nfdir + '/temp_avg_z%06.1f' % dep[k] + '_ncom_relo_gom_' + DTG + '.%3.3d' % int(tau[1]) + '.png'
plt.savefig(fig_name , dpi=200)
plt.show()
plt.clf()

Very slow plot with Matlpotlib

Can anybody help me optimize the plot function in Python? I use Matplotlib to plot financial data. Here is a small function for plotting OHLC data. The time increases significantly if I add indicators or other data.
import numpy as np
import datetime
from matplotlib.collections import LineCollection
from pylab import *
import urllib2
def test_plot(OHLCV):
    """Render one OHLC bar chart of *OHLCV* and save it as ``test.png``.

    OHLCV -- 2-D array; columns 0-4 are read as date, open, high, low and
             close, following the "Date, Open, High, Low, Close, Volume"
             layout built by the caller.

    Each bar is a vertical high-low line with an open tick to the left and a
    close tick to the right. Bars are green when the open is below the close
    and red otherwise.
    """
    bar_width = 1.3
    date_offset = 0.5
    fig = figure(figsize=(50, 20), facecolor='w')
    ax = fig.add_subplot(1, 1, 1)
    setp(ax.get_xmajorticklabels(), rotation=0)

    opens, highs, lows, closes = (OHLCV[:, col] for col in (1, 2, 3, 4))
    # Plain text colour names (not a '|S5' byte-string array, which Python 3
    # matplotlib does not accept as colours).
    color = np.where(opens < closes, 'green', 'red').tolist()
    dates = np.asarray(date2num(OHLCV[:, 0]))
    mid = dates.tolist()
    left = (dates - date_offset).tolist()
    right = (dates + date_offset).tolist()

    # All three segment lists are built from the OHLCV argument; the original
    # read the end points of the open/close ticks from the module-level
    # ``parsed_table``. list() materialises the zips for Python 3.
    segments_hl = list(zip(zip(mid, highs), zip(mid, lows)))
    segments_op = list(zip(zip(left, opens), zip(mid, opens)))
    segments_cl = list(zip(zip(right, closes), zip(mid, closes)))

    # Added in the original order: high-low, close, open.
    for segments in (segments_hl, segments_cl, segments_op):
        lines = LineCollection(segments)
        lines.set_color(color)
        lines.set_linewidth(bar_width)
        ax.add_collection(lines, autolim=True)

    ax.xaxis.set_major_locator(MonthLocator())
    ax.xaxis.set_major_formatter(DateFormatter('%Y-%m-%d'))
    ax.xaxis.set_minor_locator(DayLocator())
    ax.autoscale_view()
    ax.xaxis.grid(True, 'major')
    ax.grid(True)
    ax.set_title('EOD test plot')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price , $')
    fig.savefig('test.png', dpi = 50, bbox_inches='tight')
    # Close this figure explicitly so repeated calls do not accumulate figures.
    close(fig)
if __name__=='__main__':
    # Download the daily quotes, drop the CSV header line and reverse the
    # row order.
    # NOTE: urllib2 exists on Python 2 only, so this download step is still
    # Python 2 code.
    data_table = urllib2.urlopen(r"http://ichart.finance.yahoo.com/table.csv?s=IBM&a=00&b=1&c=2012&d=00&e=15&f=2013&g=d&ignore=.csv").readlines()[1:][::-1]
    parsed_table = []
    #Format: Date, Open, High, Low, Close, Volume
    converters = (lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').date(),float, float, float, float, int)
    for row in data_table:
        # The last CSV column is dropped; the rest is converted field by field.
        field = row.strip().split(',')[:-1]
        data_tmp = [convert(raw) for convert, raw in zip(converters, field)]
        parsed_table.append(data_tmp)
    parsed_table = np.array(parsed_table)
    import time
    bf = time.time()
    count = 100
    for _ in range(count):
        test_plot(parsed_table)
    # Parenthesise the division: '%' binds first, so the original divided the
    # already formatted string by count and raised TypeError.
    print('Plot time: %s' % ((time.time() - bf) / count))
The result is something like this. The average execution time for each plot is approximately 2.6 s. Charting in R is much faster, but I didn't measure the performance and I don't want to use Rpy, so I believe that my code is inefficient.
This solution reuses a Figure instance and saves plots asynchronously. You could change this to have as many figures as there are processors, do that many plots asynchronously, and it should speed things up even more. As it is, this takes ~1 s per plot, down from 2.6 s on my machine.
import numpy as np
import datetime
import urllib2
import time
import multiprocessing as mp
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pylab import *
from matplotlib.collections import LineCollection
class AsyncPlotter(object):
    """Save matplotlib figures in separate processes.

    Each call to ``save`` starts one ``multiprocessing.Process`` that writes
    the figure to disk. A manager-backed shared integer counts the saves in
    progress, and a worker waits while that count is at or above
    ``processes``.
    """

    def __init__(self, processes=mp.cpu_count()):
        """processes -- maximum number of saves allowed to run at once."""
        self.manager = mp.Manager()
        # Shared counter of saves currently in progress.
        self.nc = self.manager.Value('i', 0)
        # Started Process objects, joined later by join().
        self.pids = []
        self.processes = processes

    def async_plotter(self, nc, fig, filename, processes):
        """Worker body: wait for a free slot, save *fig* to *filename*, close it.

        nc        -- shared counter of running saves.
        fig       -- figure to save.
        filename  -- output path handed to ``fig.savefig``.
        processes -- slot limit the counter is compared against.
        """
        # NOTE(review): the check and the increment are separate operations on
        # the shared value, so the limit is best-effort rather than strict.
        while nc.value >= processes:
            time.sleep(0.1)
        nc.value += 1
        try:
            # Function-call form of print works on Python 2 and Python 3.
            print("Plotting " + filename)
            fig.savefig(filename)
            plt.close(fig)
        finally:
            # Release the slot even when saving fails, otherwise later
            # workers could wait forever.
            nc.value -= 1

    def save(self, fig, filename):
        """Start a background process that saves *fig* to *filename*."""
        p = mp.Process(target=self.async_plotter,
                       args=(self.nc, fig, filename, self.processes))
        p.start()
        self.pids.append(p)

    def join(self):
        """Block until every process started by ``save`` has finished."""
        for p in self.pids:
            p.join()
class FinanceChart(object):
    """OHLC chart that reuses one figure and three line collections.

    The figure, axes, locators and collections are created once in
    ``__init__``; ``test_plot`` only replaces the segment data and hands the
    figure to the asynchronous plotter for saving.
    """

    def __init__(self, async_plotter):
        """async_plotter -- object whose ``save(fig, filename)`` writes the figure."""
        self.async_plotter = async_plotter
        self.bar_width = 1.3
        self.date_offset = 0.5
        self.fig = plt.figure(figsize=(50, 20), facecolor='w')
        self.ax = self.fig.add_subplot(1, 1, 1)
        self.labels = self.ax.get_xmajorticklabels()
        setp(self.labels, rotation=0)
        # Placeholder geometry; every test_plot() call overwrites it through
        # set_segments(). One collection each for the high-low lines, the
        # open ticks and the close ticks.
        placeholder = [[(734881, 1), (734882, 5), (734883, 9), (734889, 5)]]
        self.lines_hl = self.ax.add_collection(LineCollection(placeholder), autolim=True)
        self.lines_op = self.ax.add_collection(LineCollection(placeholder), autolim=True)
        self.lines_cl = self.ax.add_collection(LineCollection(placeholder), autolim=True)
        self.ax.set_title('EOD test plot')
        self.ax.set_xlabel('Date')
        self.ax.set_ylabel('Price , $')
        self.ax.xaxis.set_major_locator(MonthLocator())
        self.ax.xaxis.set_major_formatter(DateFormatter('%Y-%m-%d'))
        self.ax.xaxis.set_minor_locator(DayLocator())

    def test_plot(self, OHLCV, i):
        """Update the chart with *OHLCV* and queue it for saving.

        OHLCV -- 2-D array; columns 0-4 are read as date, open, high, low
                 and close.
        i     -- plot number, used to build the file name ``'%04i.png'``.
        """
        opens, highs, lows, closes = (OHLCV[:, col] for col in (1, 2, 3, 4))
        # Text colour names: green when open < close, red otherwise.
        color = np.where(opens < closes, 'green', 'red').tolist()
        dates = np.asarray(date2num(OHLCV[:, 0]))
        mid = dates.tolist()
        left = (dates - self.date_offset).tolist()
        right = (dates + self.date_offset).tolist()

        # list() materialises the zips so this also works on Python 3.
        updates = (
            (self.lines_hl, list(zip(zip(mid, highs), zip(mid, lows)))),
            (self.lines_op, list(zip(zip(left, opens), zip(mid, opens)))),
            (self.lines_cl, list(zip(zip(right, closes), zip(mid, closes)))),
        )
        for collection, segments in updates:
            collection.set_segments(segments)
            collection.set_color(color)
            collection.set_linewidth(self.bar_width)

        # Limits cover everything that is drawn: the open/close ticks extend
        # date_offset beyond the first and last date, and the bars span
        # low..high. The original used the open column for both y-limits,
        # which cut off highs and lows.
        self.ax.set_xlim(min(left), max(right))
        self.ax.set_ylim(min(lows), max(highs))
        self.ax.xaxis.grid(True, 'major')
        self.ax.grid(True)
        self.async_plotter.save(self.fig, '%04i.png'%i)
if __name__=='__main__':
    # Function-call form of print works on Python 2 and Python 3.
    print("Starting")
    # Download the daily quotes, drop the CSV header line and reverse the
    # row order.
    # NOTE: urllib2 exists on Python 2 only, so this download step is still
    # Python 2 code.
    data_table = urllib2.urlopen(r"http://ichart.finance.yahoo.com/table.csv?s=IBM&a=00&b=1&c=2012&d=00&e=15&f=2013&g=d&ignore=.csv").readlines()[1:][::-1]
    parsed_table = []
    #Format: Date, Open, High, Low, Close, Volume
    converters = (lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').date(),float, float, float, float, int)
    for row in data_table:
        # The last CSV column is dropped; the rest is converted field by field.
        field = row.strip().split(',')[:-1]
        data_tmp = [convert(raw) for convert, raw in zip(converters, field)]
        parsed_table.append(data_tmp)
    parsed_table = np.array(parsed_table)
    import time
    bf = time.time()
    count = 10
    a = AsyncPlotter()
    _chart = FinanceChart(a)
    print("Done with startup tasks")
    for i in range(count):
        _chart.test_plot(parsed_table, i)
    # Wait for all background saves so the timing covers the whole work.
    a.join()
    print('Plot time: %.2f' %(float(time.time() - bf) / float(count)))

Categories

Resources