Matplotlib speed up saving plots to disk - python
I want to create an animation from roughly 250 individual frames, showing data plotted as 2D images in a figure with 4 x 11 subpanels. The data represent power spectra of velocity as a function of temporal frequency and latitude. However, each frame takes about 4 seconds to create and save, including run-time computation of the data. In the non-interactive plotting mode, I use 'agg' as the backend to avoid time spent for interactivity plotting features.
The speed bottleneck here is not the computation of the data to plot, but saving the plots to disk. Example run-times for random data (see code below) and only 5 frames are roughly 5 seconds without saving the plots and 17-19 seconds with saving them. For the actual data I use, there are some more plot artists to be drawn (text on panels, an additional line plot etc.), but the script execution time is quite similar. For the roughly 250 frames in total, this implies about 900 seconds, i.e. 15 minutes, to compute the data and then save the plots. However, since I will likely want to generate similar frames several times or with slightly different data, it would be good to decrease this script execution time.
A (hopefully) reproducible code, using random data, but with data sizes equal to the actual data I use, is given below. An example frame (the first one generated by the code) can also be found below. In the code, the function create_fig() generates a figure with subpanels containing dummy data and in the for-loop over the different frames, only the data in the subpanels is replaced.
Is there a way to speed-up saving the plots into the png files? Any help is much appreciated!
# import packages
import numpy as np
import time
import matplotlib as mpl
import matplotlib.pyplot as plt
# Root directory below which the frames are written.
path_plots_out = '/home/proxauf'

# --- set up grids -----------------------------------------------------
# Array sizes along the three data axes (nt, nlat, nlon).
nt, nlat, nlon = 3328, 24, 48

# Latitude grid: nlat points from -90 to 90 - dlat.
dlat = 7.5
lats = np.linspace(-90, 90 - dlat, nlat)

# Frequency grid from the FFT sample frequencies for sample spacing dt,
# sign-flipped and scaled by 10**9.
dt = 98191.08
nu = -np.fft.fftfreq(nt, dt) * 10 ** 9
nnu = nu.size

# Same grid with the zero frequency moved to the centre, and its spacing.
nu_fftshift = np.fft.fftshift(nu)
dnu_fftshift = nu_fftshift[1] - nu_fftshift[0]

# Indices (into the shifted grid) of the frequencies inside the plotted band.
nu_lims = [-500, 500]
_nu_lo, _nu_hi = nu_lims
ind_nu_xlims = np.flatnonzero((nu_fftshift >= _nu_lo) & (nu_fftshift <= _nu_hi))

# Image extent [left, right, bottom, top]: the outermost grid points padded
# by half a cell on every side.
_nu_band = nu_fftshift[ind_nu_xlims]
_half_dnu = dnu_fftshift / 2
_half_dlat = dlat / 2.0
ext_box_nu_lat = [
    _nu_band[0] - _half_dnu,
    _nu_band[-1] + _half_dnu,
    lats[0] - _half_dlat,
    lats[-1] + _half_dlat,
]
nnu_cut = ind_nu_xlims.size
# Select the rendering backend from the interactive flag.
# plt.ioff() switches interactive mode off, so rcParams['interactive'] is
# False here and the non-interactive 'agg' backend is used; swap plt.ioff()
# for plt.ion() to take the 'Qt5Agg' branch instead.
plt.ioff()
if plt.rcParams['interactive']:
    mpl.use('Qt5Agg')
else:
    mpl.use('agg')
# plotting function
def create_fig():
    """Create the figure: a grid of empty image panels plus one colorbar.

    Reads the module-level settings ``nrows``, ``ncols``, ``figsize``,
    ``nnu_cut``, ``nlat`` and ``ext_box_nu_lat``; they must be defined before
    this function is called.

    Returns:
        fig: the created figure.
        axes: 2D array of the panel axes, indexed as ``axes[row, col]``.
        data_list: nested list, ``data_list[row][col]`` is the image artist
            of that panel, so a frame can be updated with ``set_data()``.
    """
    # Limits and tick positions are identical for every panel.
    xlims, ylims = (-500, 500), (-90, 90)
    xticks = np.linspace(-300, 300, 3)
    xticks_minor = np.linspace(-500, 500, 21)
    yticks = np.linspace(-90, 90, 7)
    yticks_minor = np.linspace(-90, 90, 25)

    fig_left, fig_right, fig_bottom, fig_top = 0.04, 0.95, 0.06, 0.90
    fig_hspace, fig_wspace = 0.1, 0.1

    # squeeze=False keeps `axes` two-dimensional even for a single row/column,
    # so the axes[i, j] indexing below always works.
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)

    data_list = []
    for i in range(nrows):
        row_images = []
        for j in range(ncols):
            ax = axes[i, j]
            # Placeholder image of the final shape; real data is filled in
            # later through the returned artists.
            im = ax.imshow(
                np.zeros((nlat, nnu_cut)),
                interpolation='nearest',
                origin='lower',
                aspect='auto',
                cmap='binary',
                extent=ext_box_nu_lat,
            )
            im.set_clim(0, 1e4)

            # Only the bottom row / left column carry axis and tick labels.
            is_bottom_row = i == nrows - 1
            is_left_col = j == 0
            ax.set_xlabel(r'Frequency [nHz]' if is_bottom_row else '')
            ax.set_ylabel(r'Latitude [deg]' if is_left_col else '')

            ax.set_xlim(xlims)
            ax.set_ylim(ylims)
            ax.set_xticks(xticks)
            ax.set_xticks(xticks_minor, minor=True)
            ax.set_yticks(yticks)
            ax.set_yticks(yticks_minor, minor=True)
            if not is_bottom_row:
                ax.tick_params(labelbottom=False)
            if not is_left_col:
                ax.tick_params(labelleft=False)

            row_images.append(im)
        data_list.append(row_images)

    fig.subplots_adjust(left=fig_left, right=fig_right, bottom=fig_bottom, top=fig_top,
                        hspace=fig_hspace, wspace=fig_wspace)
    fig.canvas.draw()

    # Colorbar axes: right of the last column, spanning the full panel height.
    top = axes[0, -1].get_position().y1
    bottom_right_pos = axes[-1, -1].get_position()
    bottom = bottom_right_pos.y0
    right = bottom_right_pos.x1
    cbar_pad = 0.01
    cbar_width = 0.01
    cbar_height = top - bottom
    cax = fig.add_axes([right + cbar_pad, bottom, cbar_width, cbar_height])
    # With an explicit `cax`, the colorbar is drawn into that axes.
    fig.colorbar(data_list[-1][-1], cax=cax)

    return fig, axes, data_list
# Panel grid: one row per data set, one column per plotted index of the
# last data axis.
nrows = 4
ncols = 11
figsize = (16.5, 8)

# create figure with empty subpanels
fig, axes, data_list = create_fig()

# generate some data
np.random.seed(100)
data1 = np.random.rand(nt, nlat, nlon)
data2 = np.random.rand(nt, nlat, nlon)
data3 = np.random.rand(nt, nlat, nlon)
data4 = np.random.rand(nt, nlat, nlon)

# Zero-padded working copies that hold only the current window of wsize rows.
wsize = nt // 4
data1_temp = np.zeros((nt, nlat, nlon))
data2_temp = np.zeros((nt, nlat, nlon))
data3_temp = np.zeros((nt, nlat, nlon))
data4_temp = np.zeros((nt, nlat, nlon))
data_full_list = [data1, data2, data3, data4]
data_temp_list = [data1_temp, data2_temp, data3_temp, data4_temp]
for data_full, data_temp in zip(data_full_list, data_temp_list):
    data_temp[:wsize] = data_full[:wsize]

frame_cad = 10
# do not activate, else program will take about 15-20 minutes to finish
# frame_inds = range(0, nt - wsize + 1, frame_cad)
frame_inds = range(0, 50, frame_cad)

# Row indices that apply fftshift along axis 0 and select the plotted band in
# one step: fftshift(x, axes=(0,))[ind_nu_xlims] == x[ind_nu_shifted].
# Computed once instead of shifting every panel of every frame.
ind_nu_shifted = np.fft.fftshift(np.arange(nt))[ind_nu_xlims]

t0 = time.time()
for c, i in enumerate(frame_inds):
    print(c)
    if i >= 1:
        # fill in data for the next frame: the window moves from
        # [i - frame_cad, i - frame_cad + wsize) to [i, i + wsize), so the
        # first frame_cad rows leave it and the rows directly behind the old
        # window end enter it.
        lo, hi = i - frame_cad + wsize, i + wsize
        for data_full, data_temp in zip(data_full_list, data_temp_list):
            data_temp[i - frame_cad:i] = 0.0
            data_temp[lo:hi] = data_full[lo:hi]

    # compute power spectrum
    pu_temp_list = [np.abs(np.fft.fftn(data_temp, axes=(0, 2))) ** 2
                    for data_temp in data_temp_list]

    # update data in subpanels
    for row_images, pu_temp in zip(data_list, pu_temp_list):
        pu_band = pu_temp[ind_nu_shifted]
        for j, im in enumerate(row_images):
            im.set_data(pu_band[:, :, j].T)

    # save figure
    fig.savefig('%s/stackoverflow_test/frame_%04d.png' % (path_plots_out, c))

# The same figure is reused for every frame, so it is closed once at the end.
plt.close(fig)
print(time.time() - t0)
Update: Modified code blocks given below (no minor ticks, pyfftw instead of numpy, faster absolute-square computation; note: data_list return argument from create_fig() renamed to plot_data_list) yield running times of about 6s for 5 frames. The biggest speed boost comes from deactivating minor ticks (as mentioned in Jody Klymak's answer).
# Select the plotted band directly on the unshifted frequency grid and keep
# the indices that sort it, so that np.take_along_axis() with these sorting
# indices can be used later instead of np.fft.fftshift()
# (gives a slight, not too large, speed boost).
_nu_lo, _nu_hi = nu_lims
ind_nu_xlims = np.flatnonzero((nu >= _nu_lo) & (nu <= _nu_hi))
_nu_band = nu[ind_nu_xlims]
ind_nu_sort = np.argsort(_nu_band)
nu_sort = _nu_band[ind_nu_sort]

# Image extent [left, right, bottom, top] for the sorted band.
_half_dnu = dnu_fftshift / 2
ext_box_nu_lat = [
    nu_sort[0] + _half_dnu,
    nu_sort[-1] - _half_dnu,
    lats[0] - dlat / 2.0,
    lats[-1] + dlat / 2.0,
]
# plotting function
# NOTE: excerpt only - it shows the lines of create_fig() that were disabled
# in the modified version; the rest of the function body is not repeated here.
def create_fig():
    # deactivating the minor ticks massively (!) boosts plotting performance
    # ax.set_xticks(data_xticks_minor[i, j], minor=True)
    # ax.set_yticks(data_yticks_minor[i, j], minor=True)
# NOTE: this excerpt uses pyfftw; it assumes `import pyfftw` at the top of the
# script (the import is not part of the excerpt).
data_list = [data1, data2, data3, data4]
data_temp_list = [data1_temp, data2_temp, data3_temp, data4_temp]

# Band indices in ascending-frequency order.  Indexing with them is the same
# as np.take_along_axis(x[ind_nu_xlims], ind_nu_sort[:, None], axis=0), but
# the index composition is done once instead of per panel and per frame.
ind_nu_plot = ind_nu_xlims[ind_nu_sort]

# wisdom makes FFTs much faster using pyfftw than using numpy
# enable cache and set cache memory-keeping time sufficiently large
# this depends on the computation time between FFT calls
pyfftw.interfaces.cache.enable()
pyfftw.interfaces.cache.set_keepalive_time(5)

for c, i in enumerate(frame_inds):
    print(c)
    pu_temp_list = []
    for data_full, data_temp in zip(data_list, data_temp_list):
        if i >= 1:
            # fill in data for the next frame: the window moves from
            # [i - frame_cad, i - frame_cad + wsize) to [i, i + wsize), so the
            # first frame_cad rows leave it and the rows directly behind the
            # old window end enter it.
            lo, hi = i - frame_cad + wsize, i + wsize
            data_temp[i - frame_cad:i] = 0.0
            data_temp[lo:hi] = data_full[lo:hi]
        # compute Fourier transform via pyfftw
        pu_temp = pyfftw.interfaces.numpy_fft.fftn(data_temp, axes=(0, 2), threads=-1)
        # compute absolute-square using np.real(x * np.conj(x));
        # author's timing notes: about the same speed as
        # np.real(x)**2 + np.imag(x)**2, faster than
        # np.einsum('ijk,ijk->ijk', x, np.conj(x)), and faster than
        # np.abs(x)**2, which first takes a square root and then squares again
        pu_temp = np.real(pu_temp * np.conj(pu_temp))
        pu_temp_list.append(pu_temp)

    # update data in subpanels
    for row_images, pu_temp in zip(plot_data_list, pu_temp_list):
        pu_band = pu_temp[ind_nu_plot]
        for j, im in enumerate(row_images):
            im.set_data(pu_band[:, :, j].T)

    # save figure
    fig.savefig('%s/stackoverflow_test/frame_%04d.png' % (path_plots_out, c))

# The same figure is reused for every frame, so it is closed once at the end.
plt.close(fig)
print(time.time() - t0)
So if that is exactly what you want the plot to look like, then I think you are doing the fastest that you can do. I get 15 s for 5 figures, and get 5 s for not saving.
Believe it or not, the easy way to make it faster is to drop your minor ticks. If I comment those lines out I get 8 s, i.e. the time spent on saving drops from about 10 s (15 s - 5 s) to about 3 s (8 s - 5 s), a 70% reduction. Ticks are really expensive in matplotlib. Given your minor ticks are tiny, I'd suggest that as an easy optimization.
I will give you some tips, although they may not be a complete solution:
You are doing the right thing by looping over the matrix, but check whether you can make better use of the cache by transposing your matrix (when it is very tall and narrow).
Have you heard of sparse-matrix or matrix-compression techniques?
Do the work that is needed when i < 1 outside of the for loop - taking it out saves one comparison per iteration.
Can you use parallel computation, e.g. something like OpenMP for Python?
Related
scipy.integrate.solve_ivp diverges on a state space simulation well finished in MATLAB
I tried to simulate a state space model with MATLAB ode45, then I tried the same work in Python with scipy.integrate.solve_ivp. As it is obviously shown in this post pictures. Python simulation diverges for no good reason. The solvers message is "Required step size is less than spacing between numbers." but adding time steps is not a solution. Here is the MATLAB code for the time interval of half a second following a link for the plot of 173rd state: [1]: https://i.stack.imgur.com/mMdNQ.png C_static=csvread('C_static.csv'); M_static=csvread('M_static.csv'); B_static=csvread('B_static.csv'); CY_static=csvread('CY_static.csv'); DY_static=csvread('DY_static.csv'); dynamoterm = csvread('dynamoterm.csv'); C_static=0*C_static; n2panto=dynamoterm(6,1); n2cw=dynamoterm(6,2); k_dynamic = KCdyna(0,dynamoterm); K_total = K_static; K_total(n2cw+1:n2panto+1,n2cw+1:n2panto+1)=K_total(n2cw+1:n2panto+1,n2cw+1:n2panto+1)+k_dynamic; A_static = [0*K_static, eye(length(B_static)); -M_static\K_static, -M_static\C_static]; Bu = [0*B_static; M_static\B_static]; inc0 = -A_static\Bu; M_in=inv(M_static); M_cwp=M_in(:,n2cw+1:n2panto+1); timer=tic; [T, Y] = ode45(#(t,X) Asol(X,t,A_static,M_cwp,Bu,n2cw,n2panto,dynamoterm),[0,0.5],inc0); output=[CY_static,0*CY_static]*Y'+DY_static*ones(1,length(T)); figure plot(T,output(173,:)); stopwatch=toc(timer); function dx=Asol(X,t,A_static,M_cwp,Bu,n2cw,n2panto,dynamoterm) [k_dynamic]=KCdyna(t,dynamoterm); A=A_static; A(n2panto+4:2*(n2panto+3),n2cw+1:n2panto+1)=A_static(n2panto+4:2*(n2panto+3),n2cw+1:n2panto+1)-M_cwp*k_dynamic; dx=A*X+Bu; end [![MATLAB simulation plot of the 173rd state][1]][1] Here is my similar work in Python for the time interval of half a second following a link for the plot of 173rd state: [2]: https://i.stack.imgur.com/LOg2j.png from KCdyna2 import K_dyn import matplotlib.pyplot as plt from scipy.integrate import solve_ivp # Imports matrices via .csv file M = np.genfromtxt('Excel\dyn\M_static.csv', delimiter=',') C = 
np.genfromtxt('Excel\dyn\C_static.csv', delimiter=',') C = np.zeros(np.shape(C)) K_static = np.genfromtxt('Excel\dyn\K_static.csv', delimiter=',') B = np.genfromtxt('Excel\dyn\B_static.csv', delimiter=',') dyn_trm = np.genfromtxt('Excel\dyn\dynamoterm.csv', delimiter=',') # Slice addresses n2cw = int(dyn_trm[5, 1]) # Slice beginning n2panto = int(dyn_trm[5, 0]) # Slice finishing # Time interval for solution time_interval = [0, 0.5] times = np.linspace(time_interval[0], time_interval[1], 50000) M_inv = np.linalg.inv(M) K_total = K_static K_total[n2cw:n2panto + 1, n2cw:n2panto + 1] += K_dyn(0, dyn_trm) # System dynamics matrix A_static = np.block([[np.zeros((len(M_inv), len(M_inv)), dtype='uint8'), np.eye(len(M_inv), dtype='uint8')], [np.matmul(-M_inv, K_total), np.matmul(-M_inv, C)]]) Bu = np.block([[np.zeros((len(B), 1), dtype='uint8')], [np.matmul(M_inv, B).reshape(len(B), 1)]]) inc0 = np.matmul(-np.linalg.inv(A_static), Bu) M_inv = np.linalg.inv(M) M_cwp = M_inv[:, n2cw:n2panto + 1] def SttSpcEq(t, x, M_cwp, A_st, Bu, dynamoterm, n2cw,n2panto): K_dynamic = K_dyn(t, dynamoterm) A = A_st A[n2panto + 3:2*(n2panto+3), n2cw:n2panto + 1] -= np.matmul(M_cwp, K_dynamic) return (np.matmul(A, x.reshape(len(Bu), 1)) + Bu).reshape(len(Bu), ) soln = solve_ivp(SttSpcEq, time_interval, inc0.reshape(len(inc0),), method='RK45', t_eval=times, args=(M_cwp, A_static, Bu, dyn_trm, n2cw, n2panto)) print(soln.message) plt.plot(soln.t, soln.y[172]) plt.show() ``` [![Python simulation plot of the 173rd state][2]][2]
Why does my functions seem to integrate and not differentiate? (pywt.cwt)
I am really confused by the function pywt.cwt, as I've not been able to get it to work. The function seems to integrate instead of differentiating. I would like to work it as the following: Example CWT, but my graph looks like this: My CWT. The idea is to integrate the raw signal (av) with cumtrapz, then differentiate with a gaussian CWT (=> S1), and then once more differentiate with gaussian CWT (=> S2). As you can see in the pictures, the bottom peaks of the red line should line up in the valleys, but the land under the top peaks for me, and the green line should move 1/4th period to the left but moves to the right... Which makes me think it integrates for some reason. I currently have no idea what causes this... Does anyone happen to know what is going on? Thanks in advance! #Get data from pandas av = dfRange['y'] #remove gravity & turns av right way up av = av - dfRange['y'].mean() av = av * -1 #Filter [b,a] = signal.butter(4, [0.9/(55.2/2), 20/(55.2/2)], 'bandpass') av = signal.filtfilt(b,a, av) #Integrate and differentiate av => S1 integrated_av = integrate.cumtrapz(av) [CWT_av1, frequency1] = pywt.cwt(integrated_av, 8.8 , 'gaus1', 1/55.2) CWT_av1 = CWT_av1[0] CWT_av1 = CWT_av1 * 0.05 #differentiate S1 => S2 [CWT_av2, frequency2] = pywt.cwt(CWT_av1, 8.8 , 'gaus1', 1/55.2) CWT_av2 = CWT_av2[0] CWT_av2 = CWT_av2 * 0.8 #Find Peaks inv_CWT_av1 = CWT_av1 * -1 av1_min, _ = signal.find_peaks(inv_CWT_av1) av2_max, _ = signal.find_peaks(CWT_av2) #Plot plt.style.use('seaborn') plt.figure(figsize=(25, 7), dpi = 300) plt.plot_date(dfRange['recorded_naive'], av, linestyle = 'solid', marker = None, color = 'steelblue') plt.plot_date(dfRange['recorded_naive'][:-1], CWT_av1[:], linestyle = 'solid', marker = None, color = 'red') plt.plot(dfRange['recorded_naive'].iloc[av1_min], CWT_av1[av1_min], "ob", color = 'red') plt.plot_date(dfRange['recorded_naive'][:-1], CWT_av2[:], linestyle = 'solid', marker = None, color = 'green') plt.plot(dfRange['recorded_naive'].iloc[av2_max], 
CWT_av2[av2_max], "ob", color = 'green') plt.gcf().autofmt_xdate() plt.show()
I'm not sure this is your answer, but an observation from playing with pywt... From the documentation the wavelets are basically given by the differentials of a Gaussian but there is an order dependent normalisation constant. Plotting the differentials of a Guassian against the wavelets (extracted by putting in an impulse response) gives the following: The interesting observation is that the order dependent normalisation constant sometimes seems to include a '-1'. In particular, it does for the first order gaus1. So, my question is, could you actually have differentiation as you expect, but also multiplication by -1? Code for the graph: import numpy as np import matplotlib.pyplot as plt import pywt dt = 0.01 t = dt * np.arange(100) # Calculate the differentials of a gaussian by quadrature: # start with the gaussian y = exp(-(x - x_0) ^ 2 / dt) ctr = t[len(t) // 2] gaus = np.exp(-np.power(t - ctr, 2)/dt) gaus_quad = [np.gradient(gaus, dt)] for i in range(7): gaus_quad.append(np.gradient(gaus_quad[-1], dt)) # Extract the wavelets using the impulse half way through the dataset y = np.zeros(len(t)) y[len(t) // 2] = 1 gaus_cwt = list() for i in range(1, 9): cwt, cwt_f = pywt.cwt(y, 10, f'gaus{i}', dt) gaus_cwt.append(cwt[0]) fig, axs = plt.subplots(4, 2) for i, ax in enumerate(axs.flatten()): ax.plot(t, gaus_cwt[i] / np.max(np.abs(gaus_cwt[i]))) ax.plot(t, gaus_quad[i] / np.max(np.abs(gaus_quad[i]))) ax.set_title(f'gaus {i+1}', x=0.2, y=1.0, pad=-14) ax.axhline(0, c='k') ax.set_xticks([]) ax.set_yticks([])
Inaccurate values of x-axis in plot
I am trying to plot a specific course (acceleration over time) using matplotlib. The plot works so far and is being shown (see image). J equals 35 and represents the derivative of acceleration over time (which in this case is a constant). import numpy as np import matplotlib.pyplot as plt def limits_acc_course(): limits_acc_course.t1 = 0.14285714285714285 limits_acc_course.t2 = 0.14285714285714285 + 0.10714285714285715 limits_acc_course.t3 = 2*0.14285714285714285 + 0.10714285714285715 limits_acc_course.t4 = 2*0.14285714285714285 + 0.10714285714285715 + 0.5*0.24714285714285716 limits_acc_course() t_end = 2*limits_acc_course.t4 t_1 = np.linspace(0, limits_acc_course.t1) t_2 = np.linspace(limits_acc_course.t1, limits_acc_course.t2) t_3 = np.linspace(limits_acc_course.t2, limits_acc_course.t3) t_4 = np.linspace(limits_acc_course.t3, limits_acc_course.t4) tk1 = np.array([]) tk2 = np.array([]) tk3 = np.array([]) tk4 = np.array([]) for value1 in t_1: tk1 = np.append(tk1, value1*j) for value2 in t_2: tk2 = np.append(tk2, limits_acc_course.t1*j) for value3 in t_3: tk3 = np.append(tk3, (limits_acc_course.t3-value3)*j) for value4 in t_4: tk4 = np.append(tk4, value4*0) if value4 == (2*limits_acc_course.t4-limits_acc_course.t3)*j: break t = np.concatenate((tk1, tk2, tk3, tk4), axis=0) t_neg = (-1)*np.concatenate((tk1, tk2, tk3), axis=0) t_final = np.concatenate((t, t_neg), axis=0) t_range = np.linspace(0, t_end, t_final.size) fig, t = plt.subplots() t.plot(t_range, t_final) t.get_xaxis().get_major_formatter().set_useOffset(False) plt.show() The problem is that the x-coordinates in plot do not match the calculated values. The x-values in the plot (see image)) should be: 0.142857142857 0.25 (Or at least with such an accuracy:0.1429) The x-values in the plot are. 0.144777 0.295348 I have tried to turn off the offset and i have played with range from 100 to 2500 values for each part and I have tried to round the values but it didn't work either. 
Further I have tried using endpoint=False in creating the ranges t_1 to t_4. By now I ran out of ideas. enter image description here
The plot is created in an axes which will extend over ~500 pixels on screen. The x axis shows 1.1 units. Hence you have 1.1/500 = 0.0022 units per pixel. The mouse cursor cannot know its position more accurately than 1 pixel. Hence the coordinate shown by the mouse cursor is accurate to ~±0.0022 units. The observed coordinate (0.144777) deviates from the actual coordinate (0.142857142857) by 0.0019 units, which is well within the accuracy of the cursor.
Panda dataframe column cut - add more bins more frequently around the mean
I am categorizing quantitative variable (e.g. price) and I would like to categorize it in the manner that the bins would be much more frequent around the mean and less when away from the mean. I have seen that there are possibilities to cut() in linear manner and thanks to numpy.logspace in logarithmic manner, but binning around the mean seems to be void and my ideas so far haven't worked and seem to be inefficient.
You can make bins that increase in size linearly: import numpy as np def make_progressive_bins(min_x, max_x, mean_x, num_bins=10): x_rel_lim = max(mean_x - min_x, mean_x - max_x) num_bins_half = num_bins // 2 bins_right = np.arange(0, num_bins_half + 1) if num_bins % 2 == 1: bins_right = bins_right + 0.5 bins_right = np.cumsum(bins_right) bins = np.concatenate([-bins_right[bins_right > 0][::-1], bins_right]) bins = bins * (float(x_rel_lim) / bins[-1]) + mean_x return bins And then you can use it like: import numpy as np import matplotlib.pyplot as plt bins = make_progressive_bins(-20, 50, 10, 15) plt.bar(bins - 0.1, np.ones_like(bins), 0.2)
I made a script that might do what you want to achieve, but I'm not sure how to convert the resulted cut object into a histogram to see if it does what i want it to do, so please check and tell me if it works :). # Make normally distributed price with mean 50. df = pd.DataFrame(data=np.random.normal(50, size=1000), columns=['price']) df.hist(bins=30) num_bins = 100 # I used a square function to distribute the bins more around 0 and # less at the outskirts of the range. shape_func = lambda x: x**2 bin_loc = [shape_func(i) for i in range(num_bins//2)] mirrored_bin_loc = [-x for x in bin_loc[::-1]] bin_loc = mirrored_bin_loc + bin_loc[1:] # Rescale and translate bins data_mean = df.price.mean() data_range = df.price.max() - df.price.min() final_bin_loc = [(x + data_mean) / (data_range * num_bins) for x in bin_loc] # display(final_bin_loc) binned = pd.cut(df.price, bin_loc)
RINEX plotting in Python
I’m trying to plot data an in order to check my code, I’m making a comparison of the resulting plots with what has already been generated with Matlab. I am encountering several issues however with this: Generally, the parsing of RINEX files works, and the general pattern of the presentation of the data looks similar to that the Matlab scripts plotted. However there are small deviations in data that should become apparent when zooming in on the data i.e. when using a smaller time series, for example plotting over a special 2 hour period, not 24 hours. In Matlab, this small discrepancy can be seen, and a polynomial fitting applied. However for the Python plots (the first plot shown below), the curved line of this two hour period appears “smooth” and does not deviate at all, like that seen in the Matlab script (the second plot shows the blue line as the data, against the red line of the polyfit, hence, the blue line shows a slight discrepancy at x=9.4). The Matlab script is assumed correct, as this deviation is because of an Seismic activity that disrupts the ionosphere temporarily. Please refer to the plots below: The third plot is in Matlab, where this is simply the polyfit minus the live data. Therefore, it is not clear just how this data is being plotted on the axes for the Python script, because the data appears to smooth? 
Nor if my code is wrong (see below) and somehow “smooths” out the data somehow: #Calculating by looping through for sv in range(32): sat = self.obs_data_chunks_dataframe[sv, :] #print "sat.index_{0}: {1}".format(sv+1, sat.index) phi1 = sat['L1'] * LAMBDA_1 #Change units of L1 to meters phi2 = sat['L2'] * LAMBDA_2 #Change units of L2 to meters pr1 = sat['P1'] pr2 = sat['P2'] #CALCULATION: teqc Calculation iono_teqc = COEFF * (pr2 - pr1) / 1000000 #divide to make values smaller (tbc) print "iono_teqc_{0}: {1}".format(sv+1, iono_teqc) #PLOTTING #Plotting of the data plt.plot(sat.index, iono_teqc, label=‘teqc’) plt.xlabel('Time (UTC)') plt.ylabel('Ionosphere Delay (meters)') plt.title("Ionosphere Delay on {0} for Satellite {1}.".format(self.date, sv+1)) plt.legend() ax = plt.gca() ax.ticklabel_format(useOffset=False) plt.grid() if sys.platform.startswith('win'): plt.savefig(winpath + '\Figure_SV{0}'.format(sv+1)) elif sys.platform.startswith('darwin'): plt.savefig(macpath + 'Figure_SV{0}'.format(sv+1)) plt.close() Following on from point 1, the polynomial fitting code below does not run the way I’d like, so I’m overlooking something here. I assume this has to do with the data used upon the x,y-axes but can’t pinpoint exactly what. Would anyone know where I am going wrong here? 
#Zoomed in plots if sv == 19: #Plotting of the data plt.plot(sat.index, iono_teqc, label=‘teqc’) #sat.index to plot for time in UTC plt.xlim(8, 10) plt.xlabel('Time (UTC)') plt.ylabel('Ionosphere Delay (meters)') plt.title("Ionosphere Delay on {0} for Satellite {1}.".format(self.date, sv+1)) plt.legend() ax = plt.gca() ax.ticklabel_format(useOffset=False) plt.grid() #Polynomial fitting coefficients = np.polyfit(sat.index, iono_teqc, 2) plt.plot(coefficients) if sys.platform.startswith('win'): #os.path.join(winpath, 'Figure_SV{0}'.format(sv+1)) plt.savefig(winpath + '\Zoom_SV{0}'.format(sv+1)) elif sys.platform.startswith('darwin'): plt.savefig(macpath + 'Zoom_SV{0}'.format(sv+1)) plt.close() My RINEX file comprises 32 satellites. However when trying to generate the plots for all 32, I receive: IndexError: index 31 is out of bounds for axis 0 with size 31 Changing the code below to 31 solves this partly, only excluding the 32nd satellite. I’d like to also plot for satellite 32. The functions for the parsing, and formatting of the data are given below: def read_obs(self, RINEXfile, n_sat, sat_map): obs = np.empty((TOTAL_SATS, len(self.obs_types)), dtype=np.float64) * np.NaN lli = np.zeros((TOTAL_SATS, len(self.obs_types)), dtype=np.uint8) signal_strength = np.zeros((TOTAL_SATS, len(self.obs_types)), dtype=np.uint8) for i in range(n_sat): # Join together observations for a single satellite if split across lines. 
obs_line = ''.join(padline(RINEXfile.readline()[:-1], 16) for _ in range((len(self.obs_types) + 4) / 5)) #obs_line = ''.join(padline(RINEXfile.readline()[:-1], 16) for _ in range(2)) #while obs_line for j in range(len(self.obs_types)): obs_record = obs_line[16*j:16*(j+1)] obs[sat_map[i], j] = floatornan(obs_record[0:14]) lli[sat_map[i], j] = digitorzero(obs_record[14:15]) signal_strength[sat_map[i], j] = digitorzero(obs_record[15:16]) return obs, lli, signal_strength def read_data_chunk(self, RINEXfile, CHUNK_SIZE = 10000): obss = np.empty((CHUNK_SIZE, TOTAL_SATS, len(self.obs_types)), dtype=np.float64) * np.NaN llis = np.zeros((CHUNK_SIZE, TOTAL_SATS, len(self.obs_types)), dtype=np.uint8) signal_strengths = np.zeros((CHUNK_SIZE, TOTAL_SATS, len(self.obs_types)), dtype=np.uint8) epochs = np.zeros(CHUNK_SIZE, dtype='datetime64[us]') flags = np.zeros(CHUNK_SIZE, dtype=np.uint8) i = 0 #ggfrfg while True: hdr = self.read_epoch_header(RINEXfile) if hdr is None: break epoch_time, flags[i], sats = hdr #epochs[i] = np.datetime64(epoch_time) epochs[i] = epoch_time sat_map = np.ones(len(sats)) * -1 for n, sat in enumerate(sats): if sat[0] == 'G': sat_map[n] = int(sat[1:]) - 1 obss[i], llis[i], signal_strengths[i] = self.read_obs(RINEXfile, len(sats), sat_map) i += 1 if i >= CHUNK_SIZE: break return obss[:i], llis[:i], signal_strengths[:i], epochs[:i], flags[:i] def read_data(self, RINEXfile): obs_data_chunks = [] while True: obss, _, _, epochs, _ = self.read_data_chunk(RINEXfile) epochs = epochs.astype(np.int64) epochs = np.divide(epochs, float(3600.000)) if obss.shape[0] == 0: break obs_data_chunks.append(pd.Panel( np.rollaxis(obss, 1, 0), items=['G%02d' % d for d in range(1, 33)], major_axis=epochs, minor_axis=self.obs_types ).dropna(axis=0, how='all').dropna(axis=2, how='all')) self.obs_data_chunks_dataframe = obs_data_chunks[0] Any suggestions? Cheers, pymat.
I managed to solve Qu1 as it was a conversion issue with my calculation that was overlooked, the other two points are however open...