Very high memory usage with simple Python loop

I have the following code, which reads in a set of (small) observations, runs a cross-correlation calculation on them, and then saves some plots:
import os
import matplotlib.pyplot as plt
import numpy as np
import astropy.units as u
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sunkit_image.time_lag import cross_correlation, get_lags, max_cross_correlation, time_lag

# folders (list of date directories) and pairs (list of channel-pair tuples) are defined earlier
time = np.linspace(0, 43200, num=int(43200/12))
timeu = time * u.s

for i in range(len(folders)):  # loop over all dates
    os.chdir('/Volumes/LaCie/timelags/RARs/' + folders[i])
    print(folders[i])
    for j in range(len(pairs)):  # iterate over every pair of data sets
        for x in range(36):  # sliding 2-hour window that shifts 20 min at a time
            # read in only the necessary slice (the entire file is only ~6 GB)
            ch_a = np.load('dc' + pairs[j][0] + '.npy', allow_pickle=True)[()][100*x:(100*x)+600, :, :]
            ch_b = np.load('dc' + pairs[j][1] + '.npy', allow_pickle=True)[()][100*x:(100*x)+600, :, :]
            ctime = timeu[100*x:(100*x)+600]  # set up the matching time array
            print('ctime range:', ctime[0], ctime[-1], len(ctime))
            max_cc_map = max_cross_correlation(ch_a, ch_b, ctime)
            tl_map = time_lag(ch_a, ch_b, ctime)
            del ch_a  # trying to deal with the memory issue
            del ch_b
            plt.close('all')  # making sure I don't just create endless open plots
            fig = plt.figure()
            ax = fig.add_subplot()
            im = ax.imshow(np.flip(tl_map, axis=0), cmap="cubehelix", vmin=-6000, vmax=6000)
            cax = make_axes_locatable(ax).append_axes("right", size="5%", pad="10%")
            fig.colorbar(im, cax=cax, label=r"$\tau_{AB}$ [s]")
            plt.tight_layout()
            fig.savefig('timelag_' + pairs[j][0] + '_' + pairs[j][1] + '_' + str(x) + '.png', dpi=400)
            fig = plt.figure()
            ax = fig.add_subplot()
            im = ax.imshow(np.flip(max_cc_map, axis=0), cmap="plasma", vmin=0, vmax=1)
            cax = make_axes_locatable(ax).append_axes("right", size="5%", pad="10%")
            fig.colorbar(im, cax=cax, label=r"Max Cross-correlation")
            plt.tight_layout()
            fig.savefig('maxcc_' + pairs[j][0] + '_' + pairs[j][1] + '_' + str(x) + '.png', dpi=400)
            fig = plt.figure(figsize=(10, 6))
            bins = np.arange(-6000, 6000, 12000/50)
            values_tl, bins_tl, bars = plt.hist(np.ravel(np.asarray(tl_map)),
                                                bins=bins, log=True, label='Time Lags')
            values_masked, bins_masked, bars = plt.hist(
                np.ravel(np.asarray(tl_map)[np.where(np.asarray(max_cc_map) > 0.25)]),
                bins=bins, log=True, label='Masked CC > 0.25')
            values_masked2, bins_masked2, bars = plt.hist(
                np.ravel(np.asarray(tl_map)[np.where(np.asarray(max_cc_map) > 0.5)]),
                bins=bins, log=True, label='Masked CC > 0.5')
            values_masked3, bins_masked3, bars = plt.hist(
                np.ravel(np.asarray(tl_map)[np.where(np.asarray(max_cc_map) > 0.75)]),
                bins=bins, log=True, label='Masked CC > 0.75')
            plt.ylabel('Pixel Occurrence')
            plt.legend()
            fig.savefig('hist_tl_cc_' + pairs[j][0] + '_' + pairs[j][1] + '_' + str(x) + '.png', dpi=400)
As noted in the comments, I've inserted a few lines to try to dump unnecessary data between iterations. I know a three-deep for loop isn't the most efficient way to code, but the loops over the dates and channel pairs are very short; almost all of the time and memory is spent in the innermost loop. The problem is that after a few minutes, the memory usage oscillates between 30 and 55 GB. My Mac is becoming sluggish, and it's only at the beginning of the dataset. Is there something I'm missing here? Even if the entire files were being read in at the start instead of a subset, that would only be ~12 GB of data, and the code would crash if I were reading in the whole thing (i.e., it's definitely reading only part of the raw data). I tried a with statement, but that didn't reduce memory usage either. Any suggestions would be very welcome!

Per inner-loop iteration you create three figures, and plt.close('all') at the top of the iteration only closes them one full pass later; pyplot keeps every figure alive until it is explicitly closed. After each fig.savefig(...), close the figure immediately with plt.close(fig).
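In other words, a minimal sketch of the pattern, with random data standing in for the real time-lag maps:

import numpy as np
import matplotlib.pyplot as plt

for x in range(36):
    tl_map = np.random.uniform(-6000, 6000, (512, 512))  # stand-in for the real map

    fig, ax = plt.subplots()
    im = ax.imshow(np.flip(tl_map, axis=0), cmap="cubehelix", vmin=-6000, vmax=6000)
    fig.colorbar(im, label=r"$\tau_{AB}$ [s]")
    fig.savefig('timelag_' + str(x) + '.png', dpi=400)
    plt.close(fig)  # release this figure's memory before the next iteration

With at most one figure alive at a time, memory usage should stay flat across iterations.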

Related

How do I find, plot, and output the peaks of a live plotted Fast Fourier Transform (FFT) in Python?

I am working with the pyaudio and matplotlib packages for the first time, and I am attempting to plot live audio data from microphone input, transform it to frequency-domain information, and then output peaks separated by a given minimum distance. This project is a modification of the three-part guide to building a spectrum analyzer found here.
Currently the code is formatted as a class, since I have alternative methods that I apply to the audio, but I am only posting the class with the relevant methods, as they don't reference each other and are self-contained. Another quirk of the program is that it opens a local file even though it only uses input from the user's microphone; this is a leftover from the original functionality of plotting a sound file's intensity while it played and is no longer integral to the code.
import pyaudio
import wave
import struct
import pandas as pd
from scipy.fftpack import fft
from scipy.signal import find_peaks
import matplotlib.pyplot as plt
import numpy as np

class Wave:
    def __init__(self, file) -> None:
        self.CHUNK = 1024 * 4
        self.obj = wave.open(file, "r")
        self.callback_output = []
        self.data = self.obj.readframes(self.CHUNK)
        self.rate = 44100
        # Initiate an instance of PyAudio
        self.p = pyaudio.PyAudio()
        # Open a stream with the file specifications
        self.stream = self.p.open(format=pyaudio.paInt16,
                                  channels=self.obj.getnchannels(),
                                  rate=self.rate,
                                  output=True,
                                  input=True,
                                  frames_per_buffer=self.CHUNK)

    def fft_plot(self, distance: float):
        x_fft = np.linspace(0, self.rate, self.CHUNK)
        fig, ax = plt.subplots()
        line_fft, = ax.semilogx(x_fft, np.random.rand(self.CHUNK), "-", lw=2)
        # Bind plot window sizes
        ax.set_xlim(20, self.rate / 2)
        plot_data = self.stream.read(self.CHUNK)
        self.data_int = pd.DataFrame(struct.unpack(
            str(self.CHUNK * 2) + 'h', plot_data)).astype(dtype="b")[::2]
        y_fft = fft(self.data_int)
        line_fft.set_ydata(np.abs(y_fft[0:self.CHUNK]) / (256 * self.CHUNK))
        plt.show(block=False)
        while True:
            # Read incoming audio data
            data = self.stream.read(self.CHUNK)
            # Convert data to bits then to array
            self.data_int = struct.unpack(str(4 * self.CHUNK) + 'B', data)
            # Recompute FFT and update line
            yf = fft(self.data_int)
            line_data = np.abs(yf[0:self.CHUNK]) / (128 * self.CHUNK)
            line_fft.set_ydata(line_data)
            # Find all values above threshold
            peaks, _ = find_peaks(line_data, distance=distance)
            # Update the plot
            plt.plot(peaks, line_data[peaks], "x")
            fig.canvas.draw()
            fig.canvas.flush_events()
            # Exit program when plot window is closed
            fig.canvas.mpl_connect('close_event', exit)

test_file = "C:/Users/Tam/Documents/VScode/Final Project/PrismGuitars.wav"
audio_test = Wave(test_file)
audio_test.fft_plot(2000)
The code does not throw any errors and runs fine with an acceptable framerate, terminating only when the plot window is closed, all of which is good. The issue I'm encountering is with the determination and plotting of the peaks of line_data: when I run this code, the output over time looks like this matplotlib graph instance.
It seems that the peaks are being found, but at lower x-values than the corresponding points of line_data, so they appear shifted. The other, more minor, issue is that since this is a live plot, I would like to clear the previous peak markers so that only the current ones are shown, rather than all of those plotted before.
In earlier attempts I tried to use line_fft in the peak detection, but since it is a Line2D object, the peak-detection algorithm can't deal with that data type. I also tried implementing a list comprehension, as seen in this post, but the cast to list was prohibitively slow and did not return any peak markers when I ran it.
EDIT: Following Jody's input, the program now returns the proper values; I was only printing an index for the x-coordinate of the peak marker. Nevertheless, I would still appreciate some insight into whether it is possible to update the markers per iteration rather than having all the previous ones constantly displayed.
As for the marker updating, I have tried clearing the plot inside the while loop, both before and after drawing the markers (in separate tests, of course), but I only ever end up with a completely blank graph.
Please let me know if there is anything I should clarify, and thank you for your time.
As Jody pointed out, the peaks variable contains indices of the detected peaks; these need to be used to index into x_fft and line_data so the markers match up with the displayed data.
First we create a scatter plot:
scat = ax.scatter([], [], c = "purple", marker = "x")
The peak coordinates can then be stacked into a single (N, 2) array inside the while loop:
array_peaks = np.c_[x_fft[peaks], line_data[peaks]]
and used to update the scatter's data, replacing the previous markers:
scat.set_offsets(array_peaks)
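Putting the pieces together, here is a minimal self-contained sketch of the pattern, with random data standing in for the live audio stream:

import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import find_peaks

fig, ax = plt.subplots()
x = np.linspace(20, 22050, 4096)
line, = ax.semilogx(x, np.zeros_like(x), lw=2)
scat = ax.scatter([], [], c="purple", marker="x")  # one scatter artist, reused
ax.set_ylim(0, 1)
plt.show(block=False)

for _ in range(50):
    y = np.random.rand(4096)                     # stand-in for the FFT magnitudes
    line.set_ydata(y)
    peaks, _ = find_peaks(y, distance=200)
    scat.set_offsets(np.c_[x[peaks], y[peaks]])  # replaces the old markers
    fig.canvas.draw()
    fig.canvas.flush_events()

Because set_offsets replaces the scatter's data rather than adding a new artist, only the current markers are displayed on each redraw.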

Creating moving images in Python

I am wondering what's the best approach to turn a large number of images into a moving one in Python. A lot of examples I've found seem to deal with actual videos or video games, such as pygame, which seems overcomplicated for what I'm looking to do.
I have created a loop, and would like the image to update every time the code runs through it. Is there perhaps a method in Python to plot each new image over the previous one, erasing the old one with each iteration?
sweeps_no = 10
for t in range(sweeps_no):
    i = np.random.randint(N)
    j = np.random.randint(N)
    arr = nearestneighbours(lat, N, i, j)
    energy = delta_E(lat[i,j], arr, J)
    if energy <= 0:
        matrix[i,j] *= matrix[i,j]
    elif np.exp(energy/T) >= np.random.random():
        matrix[i,j] *= -matrix[i,j]
    else:
        matrix[i,j] = matrix[i,j]
    t += 1
    print t
    res.append(switch)
    image = plt.imshow(lat)
    plt.show()
Also, I can't understand why the loop above doesn't result in 10 different images showing up, given that the imshow call is inside the loop.
You can update a single figure by calling fig.canvas.draw() after your call to imshow(). It is important to include a pause, e.g. plt.pause(2), so that you can see the changes to your figure.
The following is a runnable example:
import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure()  # create the figure
for i in range(10):
    data = np.random.randn(25).reshape(5, 5)  # some fake data
    plt.imshow(data)
    fig.canvas.draw()
    plt.pause(2)  # pause for 2 seconds
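If creating a new image artist on every pass becomes a concern (see the memory discussions elsewhere on this page), a variation on the same idea (an untested sketch) calls imshow once and then updates its data in place:

import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure()
im = plt.imshow(np.random.randn(5, 5))  # create the image artist once
for i in range(10):
    im.set_data(np.random.randn(5, 5))  # swap the data in place
    fig.canvas.draw()
    plt.pause(2)  # pause so each update is visible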

Matplotlib plot excessively slow

I'm trying to plot 20 million data points, but it's taking an extremely long time (over an hour) using matplotlib.
Is there something in my code that is making this unusually slow?
import csv
import matplotlib.pyplot as plt
import numpy as np
import Tkinter
from Tkinter import *
import tkSimpleDialog
from tkFileDialog import askopenfilename

plt.clf()
root = Tk()
root.withdraw()
listofparts = askopenfilename()  # ask the user to select a file
root.destroy()

my_list1 = []
my_list2 = []
k = 0
csv_file = open(listofparts, 'rb')
for line in open(listofparts, 'rb'):
    current_part1 = line.split(',')[0]
    current_part2 = line.split(',')[1]
    k = k + 1
    if k >= 2:  # skip the first line
        my_list1.append(current_part1)
        my_list2.append(current_part2)
csv_file.close()

plt.plot(my_list1 * 10, 'r')
plt.plot(my_list2 * 10, 'g')
plt.show()
plt.close()
There is no reason whatsoever to put a line plot of 20,000,000 points in matplotlib.
Let's consider printing first:
The maximum figure size in matplotlib is 50 inches. Even a high-tech plotter at 3600 dpi would give at most 50 × 3600 = 180,000 resolvable points.
For screen applications it's even less: even a high-end 4k screen has only ~4000 pixels of resolution. Even exploiting anti-aliasing effects, at most ~3 points per pixel remain distinguishable to the human eye. Result: a maximum of ~12,000 points makes sense.
Therefore the question you should be asking is rather: how do I subsample my 20,000,000 data points down to a set that still produces the same image on paper or screen?
The solution strongly depends on the nature of the data. If it is sufficiently smooth, you can just take every nth list entry:
sample = data[::n]
If there are high-frequency components that need to be resolved, this requires more sophisticated techniques, which again depend on what the data looks like.
One such technique might be the one shown in How can I subsample an array according to its density? (Remove frequent values, keep rare ones).
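As a concrete illustration of the every-nth-point idea, a small sketch with synthetic data:

import numpy as np
import matplotlib.pyplot as plt

data = np.sin(np.linspace(0, 100, 20000000))  # 20 million synthetic points
n = len(data) // 10000                        # keep roughly 10,000 of them
plt.plot(data[::n], 'r')
plt.show()

For smooth data the subsampled plot is visually indistinguishable from the full one, but it draws in a fraction of the time.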
The following approach might give you a small improvement. It avoids splitting each row twice (by using Python's csv module) and removes the if statement by skipping the two header lines before entering the loop:
import matplotlib.pyplot as plt
import csv

l1, l2 = [], []

with open('input.csv', 'rb') as f_input:
    csv_input = csv.reader(f_input)
    # Skip two header lines
    next(csv_input)
    next(csv_input)
    for cols in csv_input:
        l1.append(cols[0])
        l2.append(cols[1])

plt.plot(l1, 'r')
plt.plot(l2, 'g')
plt.show()
The main slowdown, though, will still be the plot itself.
I would recommend switching to pyqtgraph. I switched to it because of speed issues while trying to make matplotlib plot real-time data, and it worked like a charm. Here's my real-time plotting example.
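For reference, a minimal pyqtgraph sketch of a continuously updating line plot; note that the Qt entry point differs between pyqtgraph versions, so treat the details here as assumptions to adapt:

import numpy as np
import pyqtgraph as pg
from pyqtgraph.Qt import QtCore

plot = pg.plot(title="live data")  # opens a plot window
curve = plot.plot(pen='g')
data = np.random.randn(10000)

def update():
    global data
    data = np.roll(data, -100)
    data[-100:] = np.random.randn(100)
    curve.setData(data)  # redraws only this curve

timer = QtCore.QTimer()
timer.timeout.connect(update)
timer.start(50)  # update every 50 ms

if __name__ == '__main__':
    pg.exec()  # pg.exec() on recent versions; older releases use the Qt app's exec_()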

Why does this loop in Python run progressively slower?

In this code, there is a 4-D array of 13x13 images. I would like to save each 13x13 image using matplotlib.pyplot. Here, for debugging purposes, I limit the outer loop to a single index.
# fts is a numpy array of shape (4000, 100, 13, 13)
no_images = 4000
for m in [1]:
    for i in range(no_images):
        print i,
        fm = fts[i][m]
        if fm.min() != fm.max():
            fm -= fm.min()
            fm /= fm.max()  # scale to [0,1]
        else:
            print 'unscaled'
        plt.imshow(fm)
        plt.savefig('m'+str(m)+'_i'+str(i)+'.png')
Saving 4000 images took more than 20 hours. Why is it this slow?
If I limit the inner loop to the first 100 images, it takes about 1 minute, so the whole thing should complete in about 40 minutes, not over 20 hours. And I notice it seems to run progressively slower.
What you are experiencing here is a memory leak: you keep creating AxesImage instances (by repeatedly calling plt.imshow) until they can no longer fit into RAM, at which point the whole thing begins swapping to disk, which is incredibly slow. To avoid the leak, you can either destroy each AxesImage instance as soon as you no longer need it:
...
image = plt.imshow(fm)
plt.savefig('m'+str(m)+'_i'+str(i)+'.png')
del image
Or, alternatively, you can create only one AxesImage, and then just change the data in it:
...
image = None
for m in [1]:
    for i in range(no_images):
        ...
        if image is None:
            image = plt.imshow(fm)
        else:
            image.set_data(fm)
        ...
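Filling in the ellipses, a self-contained sketch of that second approach, with random data standing in for fts and a shortened loop:

import numpy as np
import matplotlib.pyplot as plt

fts = np.random.rand(100, 2, 13, 13)  # stand-in for the real (4000, 100, 13, 13) array

image = None
for m in [1]:
    for i in range(100):
        fm = fts[i][m]
        if image is None:
            image = plt.imshow(fm)   # create the AxesImage once
        else:
            image.set_data(fm)       # afterwards, just swap the data
        plt.savefig('m' + str(m) + '_i' + str(i) + '.png')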
I had the same issue and tried the solutions above, but my dataset was too big for my RAM: the run collapsed after about 20,000 images. What I eventually found is that plt.close() and del image on their own are not enough, because they don't clear all the data pyplot keeps stored; to release it completely, you need to combine plt.figure().clear(), plt.close(), plt.cla(), and plt.clf().
This might work for you:
# fts is a numpy array of shape (4000, 100, 13, 13)
no_images = 4000
for m in [1]:
    for i in range(no_images):
        print i,
        fm = fts[i][m]
        if fm.min() != fm.max():
            fm -= fm.min()
            fm /= fm.max()  # scale to [0,1]
        else:
            print 'unscaled'
        plt.imshow(fm)
        plt.savefig('m'+str(m)+'_i'+str(i)+'.png')
        plt.figure().clear()
        plt.close()
        plt.cla()
        plt.clf()

matplotlib MemoryError on pcolorfast

The Problem:
I'm currently loading column data from text files into numpy arrays, then plotting them and saving the resulting image. Because the values will always lie on an equally spaced grid, it seemed an appropriate time to use pcolorfast. Each array is necessarily square, usually between 1024x1024 and 8192x8192; at present, I'm only concerned with getting this working up to and including 4096x4096 sizes. This needs to be done for hundreds of files, and while the first image completes successfully, subsequent images crash with a MemoryError.
Unsuccessful solutions:
I've ensured, as per here, that I have hold = False in rc.
Limitations:
The images must be saved using all 4096x4096 values, and cannot be scaled down to 1024x1024 (as suggested here).
Notes:
Watching memory usage during each phase (create empty array, load values, plot, save), I see that the array A is still sitting in memory after makeFrame completes. Is an explicit call to delete it required? Does fig need to be explicitly deleted, or should pylab take care of that? Ideally (perhaps obviously), memory usage would return to roughly the level it was at before the call to makeFrame().
Any and all advice is greatly appreciated. I've been trying to resolve this for a few days, so it's not unlikely I've missed something obvious, and an obvious solution would be welcome (even if the alternative is that this is a more complicated problem).
Current code sample:
import numpy
import matplotlib
matplotlib.use("AGG")
import matplotlib.pylab as plt

def makeFrame(srcName, dstName, coloring, sideLength,
              dataRanges, delim, dpi):
    v, V, cmap = coloring
    n = sideLength
    xmin, xmax, ymin, ymax = dataRanges
    A = numpy.empty((n,n), float)
    dx = (xmax-xmin) / (n-1)
    dy = (ymax-ymin) / (n-1)
    srcfile = open(srcName, 'rb')
    for line in srcfile:
        lineVals = line[:-1].split(delim)
        x = float(lineVals[0])
        y = float(lineVals[1])
        c = float(lineVals[2])
        # Find index from float value, adjust for rounding
        i = (x-xmin) / dx
        if (i - int(i)) > .05: i += 1
        j = (y-ymin) / dy
        if (j - int(j)) > .05: j += 1
        A[int(i), int(j)] = c
    srcfile.close()
    print "loaded vals"
    fig = plt.figure(1)
    fig.clf()
    ax = fig.gca()
    ScalarMap = ax.pcolorfast(A, vmin=v, vmax=V, cmap=cmap)
    fig.colorbar(ScalarMap)
    ax.axis('image')
    fig.savefig(dstName, dpi=dpi)
    plt.close(1)
    print "saved image"
Caveats:
There might be a better way to deal with this memory problem that I don't know about.
I haven't been able to reproduce this error: when I use matplotlib.cbook.report_memory(), my memory usage seems to level out as expected.
Despite the caveats, I thought I'd mention a general, cheap method for dealing with a program that refuses to release memory: use the multiprocessing module to run the problematic function in a separate process, wait for that process to finish, then call it again. Each time a subprocess ends, you regain the memory it used.
So I suggest trying something like this:
import matplotlib.cbook as mc
import multiprocessing as mp
import matplotlib.cm as cm

if __name__ == '__main__':
    for _ in range(10):
        srcName = 'test.data'
        dstName = 'test.png'
        vmin = 0
        vmax = 5
        cmap = cm.jet
        sideLength = 500
        dataRanges = (0.0, 1.0, 0.0, 1.0)
        delim = ','
        dpi = 72
        proc = mp.Process(target=makeFrame, args=(
            srcName, dstName, (vmin, vmax, cmap), sideLength,
            dataRanges, delim, dpi))
        proc.start()
        proc.join()  # wait for the subprocess (and its memory) to be released
        usage = mc.report_memory()
        print(usage)
