Memory leak in pandas MultiIndex (with minimum reproducible example) - python

Earlier today I posted this question. I now have an MRE that can reproduce the issue.
In short, this piece of code seems to use much more memory than it should. The idea is to average a number of time traces into a certain number of bins; the traces are arranged in a matrix using a pd.MultiIndex.
import numpy as np
import pandas as pd

# Length of each trace
trace_len = 2500
# Number of bins
bin_num = 300
# Traces matrix dimensions
L1 = 70
L2 = 100

index = pd.MultiIndex.from_product([range(L1), range(L2)])
traces = np.random.random((L1*L2, trace_len))
traces_df = pd.DataFrame(traces, index=index)

# Let's make 300 random bins
bins = [index.to_frame().sample(frac=1, replace=True) for _ in range(bin_num)]
bins = [pd.MultiIndex.from_frame(bin) for bin in bins]

def bin_single(traces: pd.DataFrame, bin_idx: pd.Index) -> np.ndarray:
    """Cumulative sum of all shots that are both in traces and bin_idx."""
    bin_idx = bin_idx.intersection(traces.index)
    binned = traces.reindex(bin_idx, copy=False)
    return binned.sum(axis=0, skipna=False).to_numpy()

output = np.empty((bin_num, trace_len))
for n, bin in enumerate(bins):
    output[n] = bin_single(traces_df, bin)

print(output.nbytes)
This is the memory allocation over time:
The issue cannot be due to lazy allocation of output, since that array is only 6 MB (as reported by output.nbytes), while the overall memory allocation grows by more than 200 MB over the for loop.
I think the problem might be hidden in the pd.MultiIndex usage, since the following very similar program, which does not use a MultiIndex, does not show the memory increase:
import numpy as np
import pandas as pd

# Length of each trace
trace_len = 2500
# Number of bins
bin_num = 300
# Traces matrix dimensions
L1 = 70
L2 = 100

#index = pd.MultiIndex.from_product([range(L1), range(L2)])
traces = np.random.random((L1*L2, trace_len))
traces_df = pd.DataFrame(traces)
index = traces_df.index

# Let's make 300 random bins
bins = [index.to_frame().sample(frac=1, replace=True) for _ in range(bin_num)]
bins = [pd.MultiIndex.from_frame(bin) for bin in bins]

def bin_single(traces: pd.DataFrame, bin_idx: pd.Index) -> np.ndarray:
    """Cumulative sum of all shots that are both in traces and bin_idx."""
    bin_idx = bin_idx.intersection(traces.index)
    binned = traces.reindex(bin_idx, copy=False)
    return binned.sum(axis=0, skipna=False).to_numpy()

output = np.empty((bin_num, trace_len))
for n, bin in enumerate(bins):
    output[n] = bin_single(traces_df, bin)

print(output.nbytes)
I tend to think that there might be a bug somewhere in pd.MultiIndex, but maybe I'm just overlooking something.
Thanks a lot!
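As a point of comparison (not from the original post), here is a minimal sketch of the same binning done without repeatedly reindexing on the MultiIndex: Index.get_indexer translates each bin's labels into integer row positions, and the sum happens on the underlying NumPy array. The helper name is illustrative only, and the np.unique call mimics the intersection in bin_single, which keeps each label at most once.
import numpy as np
import pandas as pd

def bin_single_positional(values: np.ndarray, traces_index: pd.Index,
                          bin_idx: pd.Index) -> np.ndarray:
    """Sum the rows of `values` whose labels appear in both indexes."""
    pos = traces_index.get_indexer(bin_idx)   # -1 for labels not in traces_index
    pos = np.unique(pos[pos >= 0])            # unique labels only, like Index.intersection
    return values[pos].sum(axis=0)

values = traces_df.to_numpy()                 # hoisted out of the loop
output = np.empty((bin_num, trace_len))
for n, b in enumerate(bins):
    output[n] = bin_single_positional(values, traces_df.index, b)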

Related

Filling a 3D Array and Plotting the Values

I would like to write a code in Python that evaluates the time evolution of a density distribution, p(x,y). The initial condition is p(t=0,x,y)=exp[-((x-500)^2)/500], and the formula for the solution is in the code below: t is the time index, i the space index (x-direction), j the space index (y-direction), and v=0.8.
My goal is to run the scheme for 10 iterations and plot the results at the final time step (t=9). What I'm getting is a big array just filled with zeros. I think it's because I am not using the 3D arrays correctly; does anyone have any suggestions? Thank you.
My attempt:
import numpy as np
import matplotlib.pyplot as plt

# Input parameters
Nx = 1000  # number of grid points in x-direction
Ny = 500   # number of grid points in y-direction
T = 10     # number of time steps
v = 0.8

p = np.zeros((T, Nx, Ny))
P = np.zeros((T, Nx, Ny))

for t in range(0, T-1):
    for i in range(0, Nx-1):
        for j in range(0, Ny-1):
            P[t, i, j] = p[t, i, j] - ((v/2)*(p[t, i+1, j] - p[t, i, j]))
            p[0, i, j] = np.exp(((-1*(i-500))**2)/500)

x = P[9, i]
y = P[9, j]
print(x)

plt.plot(x, y)
plt.xlim([0, 1000])
plt.ylim([0, 500])
plt.xlabel('x-direction')
plt.ylabel('y-direction')
plt.title("Density Distribution After 10 Iterations")
Looks like you only fill the values for t in range(0, T-1), which stops at t = 8, while you are trying to read x = P[9, i]. Those entries never get filled, so obviously they are all 0.
Try range(0, T), which loops over 0, 1, 2, ..., T-1. Also change range(0, Nx) and range(0, Ny).
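Not from the original answer, just a sketch of the suggested range change, reusing T, Nx, Ny, v, p and P exactly as defined in the question; the i loop is kept at Nx - 1 here so that p[t, i+1, j] stays inside the array.
for t in range(T):              # now reaches t = T-1, so P[9] actually gets written
    for i in range(Nx - 1):     # i + 1 must remain a valid index
        for j in range(Ny):
            P[t, i, j] = p[t, i, j] - (v / 2) * (p[t, i + 1, j] - p[t, i, j])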

How to reduce memory usage in xarray multidimensional rolling aggregation?

Given a multidimensional xarray DataArray, I would like to perform multidimensional rolling aggregation. For example, if I have a DataArray that is m x n x k, I would like to be able to roll the data along the m axis, and aggregate away either the n or k dimension.
I have an approach that gives me the correct answer but seems not to scale at all. If my window sizes are small, it is feasible, but in the case of a 5000 x 2000 x 10 DataArray, rolling along the 5000 length dimension with a long window explodes memory with my current approach.
import xarray as xr
import numpy as np
import pandas as pd
drange = pd.date_range(start='2000-01-01', freq='D', periods=5000)
x = ['x%i' % i for i in range(1, 3001)]
y = ['y%i' % i for i in range(1,11)]
raw_dat = np.random.randn(len(drange), len(x), len(y))
da = xr.DataArray(raw_dat, coords={'time': drange, 'x': x, 'y': y}, dims=['time', 'x', 'y'])
new_da = da.rolling(time=20).construct('window_dim')
final_da = new_da.stack(combo=['x', 'window_dim']).std('combo')
I have also tried the following; it gives the same result, but it also runs out of memory when the rolling window is large.
new_da = da.rolling(time=20).construct('window_dim')
final_da = new_da.std(['x', 'window_dim'])
The above code works and on my machine takes roughly 35 seconds to perform the stack and aggregation, but as window size increases, memory usage explodes. I am wondering if there is a smarter way to do this type of aggregation.
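Not part of the original question, but one memory-saving direction, sketched under the assumption that the population std over 'x' and the window (xarray's default, ddof=0) is what is wanted: compute rolling sums of the values and their squares instead of materializing the window dimension with construct. The result matches new_da.std(['x', 'window_dim']) except that the first window-1 time steps come out as NaN rather than being computed on a partial window.
import numpy as np

window = 20
n = da.sizes['x'] * window                          # population size per (time, y) cell
s1 = da.sum('x').rolling(time=window).sum()         # sum over x and the time window
s2 = (da ** 2).sum('x').rolling(time=window).sum()  # sum of squares over the same population
final_da = np.sqrt(s2 / n - (s1 / n) ** 2)          # sqrt(E[x^2] - E[x]^2), i.e. population std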

How do I force two arrays to be equal for use in pyplot?

I'm trying to plot a simple moving averages function but the resulting array is a few numbers short of the full sample size. How do I plot such a line alongside a more standard line that extends for the full sample size? The code below results in this error message:
ValueError: x and y must have same first dimension, but have shapes (96,) and (100,)
This is using standard matplotlib.pyplot. I've tried deleting X values using remove and del, switching all arrays to numpy arrays (since that's the output format of my moving averages function), and adding an if condition to the append in the while loop, but none of these has worked.
import random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

def movingaverage(values, window):
    weights = np.repeat(1.0, window) / window
    smas = np.convolve(values, weights, 'valid')
    return smas

sampleSize = 100
min = -10
max = 10
window = 5

vX = np.array([])
vY = np.array([])
x = 0
val = 0
while x < sampleSize:
    val += (random.randint(min, max))
    vY = np.append(vY, val)
    vX = np.append(vX, x)
    x += 1

plt.plot(vX, vY)
plt.plot(vX, movingaverage(vY, window))
plt.show()
Expected results would be two lines on the same graph - one a simple moving average of the other.
Just change this line to the following:
smas = np.convolve(values, weights,'same')
The 'valid' option only convolves where the window completely covers the values array. What you want is 'same', which does what you are looking for.
Edit: This, however, also comes with its own issues, as it acts as if there are extra bits of data with value 0 wherever your window does not fully sit on top of the data. This can be ignored if chosen, as is done in this solution, but another approach is to pad the array with specific values of your choosing instead (see Mike Sperry's answer).
Here is how you would pad a numpy array out to the desired length with 'nan's (replace 'nan' with other values, or replace 'constant' with another mode depending on desired results)
https://docs.scipy.org/doc/numpy/reference/generated/numpy.pad.html
import numpy as np
bob = np.asarray([1,2,3])
alice = np.pad(bob,(0,100-len(bob)),'constant',constant_values=('nan','nan'))
So in your code it would look something like this:
import random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

def movingaverage(values, window):
    weights = np.repeat(1.0, window) / window
    smas = np.convolve(values, weights, 'valid')
    shorted = int((100 - len(smas)) / 2)
    print(shorted)
    smas = np.pad(smas, (shorted, shorted), 'constant', constant_values=('nan', 'nan'))
    return smas

sampleSize = 100
min = -10
max = 10
window = 5

vX = np.array([])
vY = np.array([])
x = 0
val = 0
while x < sampleSize:
    val += (random.randint(min, max))
    vY = np.append(vY, val)
    vX = np.append(vX, x)
    x += 1

plt.plot(vX, vY)
plt.plot(vX, (movingaverage(vY, window)))
plt.show()
To answer your basic question, the key is to take a slice of the x-axis appropriate to the data of the moving average. Since you have a convolution of 100 data elements with a window of size 5, the result is valid for the last 96 elements. You would plot it like this:
plt.plot(vX[window - 1:], movingaverage(vY, window))
That being said, your code could stand to have some optimization done on it. For example, numpy arrays are stored in fixed size static buffers. Any time you do append or delete on them, the entire thing gets reallocated, unlike Python lists, which have amortization built in. It is always better to preallocate if you know the array size ahead of time (which you do).
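Not in the original answer, but a small illustration of that difference (the sizes are arbitrary):
import numpy as np

# Growing with np.append reallocates and copies the whole buffer on every call:
v_y = np.array([])
for i in range(100):
    v_y = np.append(v_y, i)   # O(n) copy each iteration

# Preallocating, when the final size is known, avoids all of those copies:
v_y = np.empty(100)
for i in range(100):
    v_y[i] = i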
Secondly, running an explicit loop is rarely necessary. You are generally better off using the under-the-hood loops implemented at the lowest level in the numpy functions instead. This is called vectorization. Random number generation, cumulative sums and incremental arrays are all fully vectorized in numpy. In a more general sense, it's usually not very effective to mix Python and numpy computational functions, including random.
Finally, you may want to consider a different convolution method. I would suggest something based on numpy.lib.stride_tricks.as_strided. This is a somewhat arcane, but very effective way to implement a sliding window with numpy arrays. I will show it here as an alternative to the convolution method you used, but feel free to ignore this part.
All in all:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

def movingaverage(values, window):
    # this step creates a strided view into the same buffer
    values = np.lib.stride_tricks.as_strided(
        values,
        shape=(window, values.size - window + 1),
        strides=values.strides * 2,
    )
    smas = values.sum(axis=0, dtype=float)  # accumulate as float so the division below works on int input
    smas /= window                          # in-place to avoid a temp array
    return smas

sampleSize = 100
min = -10
max = 10
window = 5

v_x = np.arange(sampleSize)
v_y = np.cumsum(np.random.random_integers(min, max, sampleSize))

plt.plot(v_x, v_y)
plt.plot(v_x[window - 1:], movingaverage(v_y, window))
plt.show()
A note on names: in Python, variable and function names are conventionally name_with_underscore. CamelCase is reserved for class names. np.random.random_integers uses inclusive bounds just like random.randint, but allows you to specify the number of samples to generate. Confusingly, np.random.randint has an exclusive upper bound, more like random.randrange.

Using Python to take 32x32 matrices, append many of these matrices to a single array, then add a timestamp index to each matrix

I am very new to coding in Python and I am working with a .CSV file that gives me a 32x32 matrix in a 1024-column row with a time stamp. I reshaped the data to give me 32x32 arrays and looped through each row, appending the matrices to a numpy array.
i = 0
while i < len(df_array):
    if i == 0:
        spec = np.reshape(df_array[i][np.arange(1, 1025)], (32, 32))
        spectrum_matrix = spec
    else:
        spec = np.reshape(df_array[i][np.arange(1, 1025)], (32, 32))
        spectrum_matrix = np.concatenate((spectrum_matrix, spec), axis=0)
    i = i + 1
print("job done")
What I would like to do is add the time stamp from the original data file to each of the matrices, thus allowing me to resample the data over a 5 minute average. I also would like to plot the bins to get a plot similar to this Drop size distribution
As a reference I am reading in the data .CSV with pandas and here is an example of a portion of the raw data: 01.06.2017;18:22:20;0.122;0.00;51;7.401;10375;18745;57;27;0.00;23.6;0.110;0;
<SPECTRUM>;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
The ;'s after the SPECTRUM line are the 32x32 matrix.
Thanks in advance for any help!
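Not part of the original thread: if df_array is a plain 2-D NumPy array (for example the result of df.to_numpy()), the whole loop above can be replaced by a single reshape, sketched here, giving a stack of shape (n_rows, 32, 32) that matches the 3-D structure used in the answer below.
import numpy as np

# columns 1..1024 of every row at once, one 32x32 matrix per row
spectrum_matrix = df_array[:, 1:1025].reshape(-1, 32, 32)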
Python and associated packages can do many things without loops
From my understanding of your data you have a (8640 x 32 x 32) Data Structure (time x size x velocity).
Pandas works very well with 2D data structures, however for higher dimensional data I would recommend you get familiar with xarray. With this package along with pandas you can create and manipulate your data without having to resort to loops.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import seaborn as sns
%matplotlib inline

# create random data
data = (np.random.binomial(n=5, p=0.2, size=(8640, 32, 32))*1000).astype(int)
# create labels for data
sizes = np.linspace(1, 5, 32)
velocities = np.linspace(1, 1000, num=32)
# make time range of 24 hours with 10 sec intervals
ind = pd.date_range(start='2014-01-01', periods=8640, freq='10s')
# convert data to xarray 3D data structure
df = xr.DataArray(data, coords=[ind, sizes, velocities],
                  dims=['time', 'size', 'speed'])
# make a 5 min average of the data
min_average = df.resample('300s', dim='time', how='mean')
# plot sample of data and 5 min average
my1d = min_average.isel(size=5, speed=10)
my1d.plot(label='5 min avg')
plt.gca()
df.isel(size=5, speed=10).plot(alpha=0.3, c='r', label='raw_data')
plt.legend()
As for making a distribution plot like the one you linked, things become a bit trickier, but it is possible:
# transform your data to only have mean speed for each time and size,
# and convert to a pandas dataframe
mean_speed = min_average.mean(dim=['speed'])
# for some reason xarray makes you name the new column when you convert
# to a pandas dataframe. I then get rid of the extra empty variable with
# a list comprehension
df = mean_speed.to_dataframe('').unstack().T
df.index = np.array([np.array(i)[1].astype(float) for i in df.index])
# make a contour plot of your new data
plt.contourf(df.columns, df.index, df.values, cmap='PuBu_r')
plt.title('mean speed')
plt.ylabel('size')
plt.xlabel('time')
plt.colorbar()

Report a two-sample K-S statistic from two precomputed histograms

Problem:
Here I plot 2 datasets stored in text files (in the list dataset), each containing 21.8 billion data points. This makes the data too large to hold in memory as an array. I am still able to graph them as histograms, but I'm unsure how to calculate their difference via a two-sample KS test, because I cannot figure out how to access each histogram in the plt object.
Example:
Here is some code to generate dummy data:
mu = [100, 120]
sigma = 30
dataset = ['gsl_test_1.txt', 'gsl_test_2.txt']
for idx, file in enumerate(dataset):
    dist = np.random.normal(mu[idx], sigma, 10000)
    with open(file, 'w') as g:
        for s in dist:
            g.write('{}\t{}\t{}\n'.format('stuff', 'stuff', str(s)))
This generates my two histograms (made possible here):
chunksize = 1000
dataset = ['gsl_test_1.txt', 'gsl_test_2.txt']
for fh in dataset:
    # find the min, max, line qty, for bins
    low = np.inf
    high = -np.inf
    loop = 0
    for chunk in pd.read_table(fh, header=None, chunksize=chunksize, delimiter='\t'):
        low = np.minimum(chunk.iloc[:, 2].min(), low)
        high = np.maximum(chunk.iloc[:, 2].max(), high)
        loop += 1
    lines = loop*chunksize
    nbins = math.ceil(math.sqrt(lines))
    bin_edges = np.linspace(low, high, nbins + 1)
    total = np.zeros(nbins, np.int64)  # np.ndarray filled with np.uint32 zeros, CHANGED TO int64
    for chunk in pd.read_table(fh, header=None, chunksize=chunksize, delimiter='\t'):
        # compute bin counts over the 3rd column
        subtotal, e = np.histogram(chunk.iloc[:, 2], bins=bin_edges)  # np.ndarray filled with np.int64
        # accumulate bin counts over chunks
        total += subtotal
    plt.hist(bin_edges[:-1], bins=bin_edges, weights=total)
plt.savefig('gsl_test_hist.svg')
Question:
Most examples for KS statistics employ two arrays of raw data/observations/points/etc., but I don't have enough memory to use this approach. Per the example above, how can I access these precomputed bins (from 'gsl_test_1.txt' and 'gsl_test_2.txt') to compute the KS statistic between the two distributions?
Bonus karma:
Record the KS statistic and pvalue on the graph!
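For reference, a sketch (not from the original thread) of how the statistic can be obtained without the raw samples, assuming the two per-file count arrays (call them total_1 and total_2, hypothetical names) were both accumulated over the same bin_edges: build empirical CDFs from the cumulative bin counts and take their maximum absolute difference; an asymptotic p-value then comes from the Kolmogorov distribution.
import numpy as np
from scipy.stats import kstwobign

# total_1, total_2: per-file bin counts accumulated over the SAME bin_edges
cdf1 = np.cumsum(total_1) / total_1.sum()
cdf2 = np.cumsum(total_2) / total_2.sum()
ks_stat = np.max(np.abs(cdf1 - cdf2))                         # approximate two-sample KS statistic

n, m = total_1.sum(), total_2.sum()                           # sample sizes behind each histogram
p_value = kstwobign.sf(ks_stat * np.sqrt(n * m / (n + m)))    # asymptotic approximation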
I cleaned up your code a bit: writing to StringIO so it's more streamlined than writing to a file, and setting the default style with seaborn instead of plain matplotlib to make it look more modern. The bin thresholds should be the same for both samples if you want the statistical test to line up. I think if you iterate through and make the bins this way, the whole thing may take way longer than it needs to. Counter could be useful because you'd only have to loop through once, and you'd be able to make the same bin size: convert the floats to ints since you are binning them together, then from collections import Counter, C = Counter() and C[value] += 1. You'll have a dict at the end from which you can make the bins with list(C.keys()). This would be good since your data is so gnarly. Also, you should see if there is a way to do chunksize with numpy instead of pandas, because numpy is WAY faster at indexing; try a %timeit for DF.iloc[i,j] versus ARRAY[i,j] and you'll see what I mean. I wrapped much of it in a function to try to make it more modular.
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from io import StringIO
from scipy.stats import ks_2samp
import seaborn as sns; sns.set()
%matplotlib inline
# Added seaborn b/c it looks mo betta

mu = [100, 120]
sigma = 30

def write_random(file, mu, sigma=30):
    dist = np.random.normal(mu, sigma, 10000)
    for i, s in enumerate(dist):
        file.write('{}\t{}\t{}\n'.format("label_A-%d" % i, "label_B-%d" % i, str(s)))
    return(file)

# Writing to StringIO instead of an actual file
gs1_test_1 = write_random(StringIO(), mu=100)
gs1_test_2 = write_random(StringIO(), mu=120)

chunksize = 1000

def make_hist(fh, ax):
    # find the min, max, line qty, for bins
    low = np.inf
    high = -np.inf
    loop = 0
    fh.seek(0)
    for chunk in pd.read_table(fh, header=None, chunksize=chunksize, sep='\t'):
        low = np.minimum(chunk.iloc[:, 2].min(), low)    # btw, iloc is way slower than numpy array indexing
        high = np.maximum(chunk.iloc[:, 2].max(), high)  # you might wanna import and do the chunks with numpy
        loop += 1
    lines = loop*chunksize
    nbins = math.ceil(math.sqrt(lines))
    bin_edges = np.linspace(low, high, nbins + 1)
    total = np.zeros(nbins, np.int64)  # np.ndarray filled with np.uint32 zeros, CHANGED TO int64
    fh.seek(0)
    for chunk in pd.read_table(fh, header=None, chunksize=chunksize, delimiter='\t'):
        # compute bin counts over the 3rd column
        subtotal, e = np.histogram(chunk.iloc[:, 2], bins=bin_edges)  # np.ndarray filled with np.int64
        # accumulate bin counts over chunks
        total += subtotal
    ax.hist(bin_edges[:-1], bins=bin_edges, weights=total, alpha=0.5)
    return(ax, bin_edges, total)

# Make the plot canvas to write on to give it to the function
fig, ax = plt.subplots()
test_1_data = make_hist(gs1_test_1, ax)
test_2_data = make_hist(gs1_test_2, ax)
# test_1_data[1] == test_2_data[1] The bins should be the same if you're going to try and compare them...
ax.set_title("ks: %f, p_in_the_v: %f" % ks_2samp(test_1_data[2], test_2_data[2]))
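For reference, a minimal sketch (not part of the original answer) of the Counter idea described above: a single pass per file, with integer-cast values as the bucket keys so both samples share the same buckets. fh and chunksize are assumed to be as in the code above.
from collections import Counter
import pandas as pd

counts = Counter()
fh.seek(0)
for chunk in pd.read_table(fh, header=None, chunksize=chunksize, sep='\t'):
    # cast to int so equal (rounded) values land in the same bucket
    counts.update(chunk.iloc[:, 2].astype(int))
# counts maps each integer value to its frequency; list(counts.keys()) gives the buckets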
