I have some very large data to deal with. I'd like to be able to use np.load(filename, mmap_mode="r+") to use these files on disk rather than RAM. My issue is that creating them in RAM causes the exact problem I'm trying to avoid.
I already know about np.memmap, and that is a potential solution, but creating a memmap and then saving the array with np.save(filename, memmap) means I'd be doubling the disk space requirement, even if only briefly, and that isn't always an option. The main reason I don't want plain memmaps is that the header information in .npy files (namely shape and dtype) is useful to have.
My question is: can I create a numpy file without first creating it in memory? That is, can I create a numpy file by just giving a dtype and a shape? The idea would be along the lines of np.save(filename, np.empty((x, y, z))), but I'm assuming that empty requires the array to be allocated in memory before saving.
My current solution is:
import tempfile
import numpy as np
def create_empty_numpy_file(filename, shape, dtype=np.float64):
    # Build the array on disk via a temporary memmap, then save a copy as .npy.
    with tempfile.TemporaryFile() as tmp:
        memmap = np.memmap(tmp, dtype, mode="w+", shape=shape)
        np.save(filename, memmap)
EDIT
My final solution based on bnaeker's answer and a few details from numpy.lib.format:
import numpy as np
class MockFlags:
    def __init__(self, shape, c_contiguous=True):
        self.c_contiguous = c_contiguous
        self.f_contiguous = (not c_contiguous) or (c_contiguous and len(shape) == 1)
class MockArray:
    def __init__(self, shape, dtype=np.float64, c_contiguous=True):
        self.shape = shape
        self.dtype = np.dtype(dtype)
        self.flags = MockFlags(shape, c_contiguous)
    def save(self, filename):
        if self.dtype.itemsize == 0:
            buffersize = 0
        else:
            # Use a large chunk size so the Python write loop stays cheap.
            buffersize = max(16 * 1024 ** 2 // self.dtype.itemsize, 1)
        n_chunks, remainder = divmod(
            int(np.prod(self.shape)) * self.dtype.itemsize, buffersize
        )
        with open(filename, "wb") as f:
            np.lib.format.write_array_header_2_0(
                f, np.lib.format.header_data_from_array_1_0(self)
            )
            for chunk in range(n_chunks):
                f.write(b"\x00" * buffersize)
            f.write(b"\x00" * remainder)
The Numpy file format is really simple. There are a few under-documented functions you can use to create the required header bytes from the metadata needed to build an array, without actually building one.
import numpy as np
def create_npy_header_bytes(
    shape, dtype=np.float64, fortran_order=False, format_version="2.0"
):
    # 4- or 2-byte unsigned integer for the header length, depending on version
    n_size_bytes = 4 if format_version[0] == "2" else 2
    magic = b"\x93NUMPY"
    version_info = (
        int(each).to_bytes(1, "little") for each in format_version.split(".")
    )
    # Keys are supposed to be alphabetically sorted
    header = {
        "descr": np.lib.format.dtype_to_descr(np.dtype(dtype)),
        "fortran_order": fortran_order,
        "shape": shape,
    }
    # Pad the header so the total length is a multiple of 64 bytes
    header_bytes = str(header).encode("ascii")
    header_len = len(header_bytes)
    # magic string + 2 version bytes + length field + trailing newline
    current_length = header_len + len(magic) + 2 + n_size_bytes + 1
    required_length = int(np.ceil(current_length / 64.0) * 64)
    padding = required_length - current_length
    header_bytes += b" " * padding + b"\n"
    # Length of the header dict, including padding and newline
    length = len(header_bytes).to_bytes(n_size_bytes, "little")
    return b"".join((magic, *version_info, length, header_bytes))
You can test that it's equivalent with this snippet:
import numpy as np
import io
x = np.zeros((10, 3, 4))
first = create_npy_header_bytes(x.shape)
stream = io.BytesIO()
np.lib.format.write_array_header_2_0(
    stream, np.lib.format.header_data_from_array_1_0(x)
)
print(f"Library: {stream.getvalue()}")
print(f"Custom: {first}")
You should see something like:
Library: b"\x93NUMPY\x02\x00t\x00\x00\x00{'descr': '<f8', 'fortran_order': False, 'shape': (10, 3, 4), } \n"
Custom: b"\x93NUMPY\x02\x00t\x00\x00\x00{'descr': '<f8', 'fortran_order': False, 'shape': (10, 3, 4)} \n"
which match, except for the trailing comma inside the header dict representation. That doesn't matter: the header only needs to be a valid Python literal representation of a dict, and a trailing comma is perfectly legal there.
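If you want to convince yourself of that, here is a quick check with ast.literal_eval, which is essentially what numpy's header parser uses:
import ast
with_comma = "{'descr': '<f8', 'fortran_order': False, 'shape': (10, 3, 4), }"
without_comma = "{'descr': '<f8', 'fortran_order': False, 'shape': (10, 3, 4)}"
print(ast.literal_eval(with_comma) == ast.literal_eval(without_comma))  # True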
As an alternative approach, you could mock out an object that has the fields the library functions need to build the header themselves. For np.lib.format.header_data_from_array_1_0, those seem to be .shape, .dtype, and .flags (the latter must have c_contiguous and f_contiguous attributes). That's actually much simpler, and would look like:
import numpy as np
import io
class MockFlags:
    def __init__(self, shape, c_contiguous=True):
        self.c_contiguous = c_contiguous
        self.f_contiguous = (not c_contiguous) or (c_contiguous and len(shape) == 1)
class MockArray:
    def __init__(self, shape, dtype=np.float64, c_contiguous=True):
        self.shape = shape
        self.dtype = np.dtype(dtype)
        self.flags = MockFlags(shape, c_contiguous)
mock = MockArray((10, 3, 4))
stream = io.BytesIO()
np.lib.format.write_array_header_2_0(
    stream, np.lib.format.header_data_from_array_1_0(mock)
)
print(stream.getvalue())
You should see:
b"\x93NUMPY\x02\x00t\x00\x00\x00{'descr': '<f8', 'fortran_order': False, 'shape': (10, 3, 4), } \n"
which happily matches what we have above, but without the fiddly work of counting bytes and mucking with padding. Much more betterer :)
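If you want to turn that header into an actual empty file on disk, here is one sketch (not the only way to do it): write the header, then grow the file to its full data size with a relative seek plus a single byte, which most filesystems will satisfy without physically writing the zeros:
shape, dtype = (1000, 1000), np.dtype(np.float64)
with open("empty.npy", "wb") as f:
    np.lib.format.write_array_header_2_0(
        f, np.lib.format.header_data_from_array_1_0(MockArray(shape, dtype))
    )
    n_bytes = int(np.prod(shape)) * dtype.itemsize
    f.seek(n_bytes - 1, 1)  # jump to the last byte of the data region
    f.write(b"\x00")        # writing one byte fixes the file size
arr = np.load("empty.npy", mmap_mode="r+")  # the gap reads back as zeros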
I'm running the following function for an ML model.
def get_images(filename):
    bin_file = open(filename, 'rb')
    buf = bin_file.read()  # read the whole file into memory
    bin_file.close()  # release the OS file handle
    index = 0
    magic, num_images, num_rows, num_colums = struct.unpack_from(big_endian + four_bytes, buf, index)
    index += struct.calcsize(big_endian + four_bytes)
    images = []  # temp images as tuple
    for x in range(num_images):
        im = struct.unpack_from(big_endian + picture_bytes, buf, index)
        index += struct.calcsize(big_endian + picture_bytes)
        im = list(im)
        for i in range(len(im)):
            if im[i] > 1:
                im[i] = 1
However, I am receiving an error at the line:
im = struct.unpack_from(big_endian + picture_bytes, buf, index)
With the error:
error: unpack_from requires a buffer of at least 784 bytes
I have noticed that this error only occurs at certain iterations. I cannot figure out why this might be the case. The dataset is the standard MNIST dataset, which is freely available online.
I have also looked through similar questions on SO (e.g. error: unpack_from requires a buffer) but they don't seem to resolve the issue.
You didn't include the struct formats in your minimal reproducible example, so it is hard to say exactly why you are getting the error. Either you are reading a partial/corrupted file or your struct formats are wrong.
This answer uses the test file 't10k-images-idx3-ubyte.gz' and file formats found at http://yann.lecun.com/exdb/mnist/
Open the file and read it into a bytes object (gzip is used because of the file's type).
import gzip, struct
with gzip.open(r'my\path\t10k-images-idx3-ubyte.gz', 'rb') as f:
    data = bytes(f.read())
print(len(data))
The file format spec says the header is 16 bytes (four 32-bit big-endian ints). Separate it from the pixels with a slice, then unpack it:
hdr,pixels = data[:16],data[16:]
magic, num_images, num_rows, num_cols = struct.unpack(">4L",hdr)
# print(len(hdr),len(pixels))
# print(magic, num_images, num_rows, num_cols)
There are a number of ways to iterate over the individual images.
img_size = num_rows * num_cols
imgfmt = "B"*img_size
for i in range(num_images):
    start = i * img_size
    end = start + img_size
    img = pixels[start:end]
    img = struct.unpack(imgfmt, img)
    # do work on the img
Or...
imgfmt = "B"*img_size
for img in struct.iter_unpack(imgfmt, pixels):
    img = [p if p == 0 else 1 for p in img]
The itertools grouper recipe would probably also work; a sketch is below.
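For completeness, a minimal sketch of that grouper approach (the recipe from the itertools docs, reusing img_size and pixels from above):
from itertools import zip_longest
def grouper(iterable, n, fillvalue=None):
    # Collect data into fixed-length chunks, padding the last one with fillvalue.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
for img in grouper(pixels, img_size, fillvalue=0):
    img = [0 if p == 0 else 1 for p in img]
    # do work on the img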
I use the following code to save to a text file:
filepath = open(filename, 'a')
np.savetxt(filepath, C, fmt='%i')
I came from C, where I can control the size of the resulting file and know it in advance. Hence, I want to understand how the size of the file is calculated in Python. My program generates a numpy matrix of shape (12500, 65) containing the values 1 or -1. The resulting text file on disk is 2,024,874 bytes, which does not make sense to me. Since I explicitly specify fmt='%i', shouldn't it be calculated as 12500 * 65 * 8 = 6,500,000 bytes (assuming a signed integer takes 8 bytes)?
As mentioned by Mark, you're saving text, i.e. "1", not \x01\x00.... To demonstrate:
import io
import numpy as np
tenbyten = np.ones((10, 10), dtype=int)
myfile = io.BytesIO()
np.savetxt(myfile, tenbyten, fmt='%i')
len(myfile.getvalue()) # 200
myfile.getvalue()[:30] # b'1 1 1 1 1 1 1 1 1 1\n1 1 1 1 1 '
It's a string of ASCII number 1's and spaces, with newlines. Yours has some -'s mixed in, I gather. That also accounts for your exact number: each row is 65 single-digit values plus 64 spaces and a newline, i.e. 130 bytes, so 12,500 rows come to 1,625,000 bytes, and every -1 costs one extra byte for the minus sign; the 399,874-byte difference is just the number of negative entries. If you want pure binary, you could do something like the following:
raw_data = tenbyten.tobytes() # .tofile() to go to a file instead of bytestring
len(raw_data) # 800
raw_data[:10] # b'\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00'
To get something that matches your 6,500,000 bytes as an exercise, you could do len(np.empty((12500, 65), dtype='int64').tobytes()). Note that the raw data is very raw: it discards all information about the data type, endianness, and shape, so the following is true:
np.ones((10, 10)).tobytes() == np.ones((5, 20)).tobytes() == np.ones(100).tobytes()
If you use np.save, that will save the binary data along with the metadata:
my_npy = io.BytesIO()
np.save(my_npy, tenbyten)
len(my_npy.getbuffer()) # 880
my_npy.getvalue()[:70]
# b"\x93NUMPY\x01\x00F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (10, 10), "
For your case with +1/-1, forcing a datatype of int8 (with my_array.astype('int8')) is basically a free 8-fold data compression.
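A quick sketch of that last point, reusing the tenbyten array from above (the exact byte count depends on your numpy version's header padding):
small = tenbyten.astype('int8')  # 1 byte per element instead of 8
my_npy8 = io.BytesIO()
np.save(my_npy8, small)
len(my_npy8.getbuffer())  # header + 100 data bytes, instead of header + 800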
I am new to both Matlab and Python, and I have to convert a program from Matlab to Python. I am not sure how to typecast the data after reading it from the file in Python. The file used is a binary file.
Below is the Matlab code:
fid = fopen (filename, 'r');
fseek (fid, 0, -1);
meta = zeros (n, 9, 'single');
v = zeros (n, 128, 'single');
d = 0;
for i = 1:n
    meta(i,:) = fread (fid, 9, 'float');
    d = fread (fid, 1, 'int');
    v(i,:) = fread (fid, d, 'uint8=>single');
end
I have written the program below in Python:
fid = open(filename, 'r')
fid.seek(0 , 0)
meta = np.zeros((n,9),dtype = np.float32)
v = np.zeros((n,128),dtype = np.float32)
for i in range(n):
    data_str = fid.read(9);
    meta[1,:] = unpack('f', data_str)
For this unpack, I am getting the error:
"unpack requires a string argument of length 4"
Please suggest some way to make it work.
I looked into the problem a little, mainly because I will need this in the near future too. It turns out there is a very simple solution using numpy, assuming you have a Matlab matrix stored the way I do.
import numpy as np
def read_matrix(file_name):
    return np.fromfile(file_name, dtype='<f')  # little-endian single precision float
arr = read_matrix(file_path)
print arr[0:10] #sample data
print len(arr) # number of elements
The data type (dtype) you must find out yourself; help on this is in the numpy documentation. I used fwrite(fid, value, 'single'); to store the data in Matlab. If you stored yours the same way, the code above will work.
Note that the returned variable is a flat 1-D array, not a matrix; you'll have to reshape it to match the original shape of your data. In my case len(arr) is 307200, from a matrix of size 15360 x 20; a reshape sketch follows below.
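For example, a one-line sketch using the 15360 x 20 shape mentioned above (Matlab stores its data column-major, hence order='F'):
matrix = arr.reshape((15360, 20), order='F')  # column-major, matching Matlab's storage order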
I'm trying to create a random matrix and save it to a binary file using numpy.save.
Then I try to map this file using numpy.memmap, but it seems to map it wrong.
How can I fix it?
It seems it reads the .npy header as data, and I need to skip some bytes from the beginning.
import numpy as np
rows = 6
cols = 4
def create_matrix(rows, cols):
    data = (np.random.rand(rows, cols) * 100).astype('uint8')  # type for image [0 255] int8?
    return data
def save_matrix(filename, data):
    np.save(filename, data)
def load_matrix(filename):
    data = np.load(filename)
    return data
def test_mult_ram():
    A = create_matrix(rows, cols)
    A[1][2] = 42
    save_matrix("A.npy", A)
    A = load_matrix("A.npy")
    print A
    B = create_matrix(cols, rows)
    save_matrix("B.npy", B)
    B = load_matrix("B.npy")
    print B
    fA = np.memmap('A.npy', dtype='uint8', mode='r', shape=(rows, cols))
    fB = np.memmap('B.npy', dtype='uint8', mode='r', shape=(cols, rows))
    print fA
    print fB
UPDATE:
I just found that the np.lib.format.open_memmap function already exists.
Usage:
a = np.lib.format.open_memmap('A.npy', dtype='uint8', mode='r+')
If your goal is to open arrays you saved with np.save as memmaps, then you can just use np.load with the option mmap_mode:
fA = np.load('A.npy', mmap_mode='r')
fB = np.load('B.npy', mmap_mode='r')
This way you actually benefit from the header stored in the .npy files, in the sense that it keeps track of the shape and dtype of the array.
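If you also want to write through to the file, mmap_mode='r+' gives you a writable memmap; a small sketch with the file from the question:
fA = np.load('A.npy', mmap_mode='r+')
fA[1, 2] = 42  # modifies the data region of A.npy on disk
fA.flush()     # make sure the change is written out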
The .npy format has a header that must be skipped when using np.memmap. It starts with the 6-byte magic string '\x93NUMPY', followed by a 2-byte version number, a 2-byte header length, and then the header data itself.
So if you open the file and read the header length, you can compute the offset to pass to np.memmap:
def load_npy_to_memmap(filename, dtype, shape):
    # npy format is documented here:
    # https://github.com/numpy/numpy/blob/master/doc/neps/npy-format.txt
    with open(filename, 'rb') as f:
        # skip magic string \x93NUMPY + 2 bytes major/minor version number
        # + 2 bytes little-endian unsigned short int
        junk, header_len = struct.unpack('<8sh', f.read(10))
    data = np.memmap(filename, dtype=dtype, shape=shape, offset=6 + 2 + 2 + header_len)
    return data
import struct
import numpy as np
np.random.seed(1)
rows = 6
cols = 4
def create_matrix(rows, cols):
    data = (np.random.rand(rows, cols) * 100).astype('uint8')  # type for image [0 255] int8?
    return data
def save_matrix(filename, data):
    np.save(filename, data)
def load_matrix(filename):
    data = np.load(filename)
    return data
def load_npy_to_memmap(filename, dtype, shape):
    # npy format is documented here:
    # https://github.com/numpy/numpy/blob/master/doc/neps/npy-format.txt
    with open(filename, 'rb') as f:
        # skip magic string \x93NUMPY + 2 bytes major/minor version number
        # + 2 bytes little-endian unsigned short int
        junk, header_len = struct.unpack('<8sh', f.read(10))
    data = np.memmap(filename, dtype=dtype, shape=shape, offset=6 + 2 + 2 + header_len)
    return data
def test_mult_ram():
    A = create_matrix(rows, cols)
    A[1][2] = 42
    save_matrix("A.npy", A)
    A = load_matrix("A.npy")
    print A
    B = create_matrix(cols, rows)
    save_matrix("B.npy", B)
    B = load_matrix("B.npy")
    print B
    fA = load_npy_to_memmap('A.npy', dtype='uint8', shape=(rows, cols))
    fB = load_npy_to_memmap('B.npy', dtype='uint8', shape=(cols, rows))
    print fA
    print fB
    np.testing.assert_equal(A, fA)
    np.testing.assert_equal(B, fB)
test_mult_ram()
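As a less hand-rolled variant (a sketch, not part of the original answer), numpy's own header readers in np.lib.format can work out the offset, shape and dtype for you, at least for version 1.0 files:
def load_npy_to_memmap_auto(filename, mode='r'):
    with open(filename, 'rb') as f:
        np.lib.format.read_magic(f)  # consumes the magic string and version bytes
        shape, fortran_order, dtype = np.lib.format.read_array_header_1_0(f)
        offset = f.tell()  # first byte of the data region
    order = 'F' if fortran_order else 'C'
    return np.memmap(filename, dtype=dtype, mode=mode, shape=shape, order=order, offset=offset)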
Problem:
Binary data made up of fixed-size records
Want to use struct.unpack_from and struct.pack_into to manipulate the binary data
Want no copies of the data
Want multiple views into the memory, to simplify offset calculations etc.
Data could be in an array.array, a bytearray, or a ctypes string buffer
What I tried to do:
part1 = buffer(binary_data, 0, size1)
part2 = buffer(binary_data, size1, size2)
part3 = buffer(binary_data, size1 + size2) # no size is given for this one as it should consume the rest of the buffer
struct.pack_into('I', part3, 4, 42)
The problem here is that struct.pack_into complains about the buffers being read-only. I have looked into memoryviews, since they can create a read/write view, but they don't seem to let you specify the offset and size the way the buffer function does.
How can I get multiple zero-copy views into a buffer of bytes that are readable and writable, and that can be accessed/modified using struct.unpack_from and struct.pack_into?
In Python 2.6+, ctypes data types have a from_buffer method that takes an optional offset. It expects a writable buffer and will raise an exception otherwise. (For read-only buffers there's from_buffer_copy.) Here's a quick translation of your example to use ctypes char arrays:
from ctypes import *
import struct
binary_data = bytearray(24)
size1 = size2 = 4
size3 = len(binary_data) - size1 - size2
part1 = (c_char * size1).from_buffer(binary_data)
part2 = (c_char * size2).from_buffer(binary_data, size1)
part3 = (c_char * size3).from_buffer(binary_data, size1 + size2)
struct.pack_into('4I', part3, 0, 1, 2, 3, 4)
>>> binary_data[8:]
bytearray(b'\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00')
>>> struct.unpack_from('4I', part3)
(1, 2, 3, 4)
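And since all three parts share the original bytearray's memory, a write through any one of them shows up in binary_data; a quick check in the same session (byte order shown for a little-endian machine):
>>> struct.pack_into('I', part1, 0, 0xdeadbeef)
>>> binary_data[:4]
bytearray(b'\xef\xbe\xad\xde')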