Read part of a large binary file in Python

I have a large binary file (~2.5 GB). It contains a 336-byte header followed by seismic signal data (x, y and z channels) stored as int32. The number of samples is 223,200,000.
I need to read part of the signal, for example the samples in the interval [216,000,000, 219,599,999].
I wrote the function:
import numpy as np

def reading(path, start_moment, end_moment):
    file_data = open(path, 'rb')
    if start_moment is not None:
        bytes_value = start_moment * 4 * 3
        file_data.seek(336 + bytes_value)
    else:
        file_data.seek(336)
    if end_moment is None:
        try:
            signals = np.fromfile(file_data, dtype=np.int32)
        except MemoryError:
            return None
        finally:
            file_data.close()
    else:
        moment_count = end_moment - start_moment + 1
        try:
            signals = np.fromfile(file_data, dtype=np.int32,
                                  count=moment_count * 3)
        except MemoryError:
            return None
        finally:
            file_data.close()
    channel_count = 3
    signal_count = signals.shape[0] // channel_count
    signals = np.reshape(signals, newshape=(signal_count, channel_count))
    return signals
If I run a script with this function in the PyCharm IDE I get this error:
Traceback (most recent call last):
  File "D:/AppsBuilding/test/testReadBaikal8.py", line 41, in <module>
    signal_2 = reading(path=path, start_moment=216000000, end_moment=219599999)
  File "D:/AppsBuilding/test/testReadBaikal8.py", line 27, in reading
    count=moment_count * 3)
OSError: obtaining file position failed
But if I run the script with start_moment=7200000 and end_moment=10799999, everything works fine.
My PC runs 32-bit Windows 7 with 1.95 GB of RAM.
Please help me resolve this problem.

Divide the file into small segments and free the memory after each small piece of content is processed:
def read_in_block(file_path):
    BLOCK_SIZE = 1024
    with open(file_path, "rb") as f:
        while True:
            block = f.read(BLOCK_SIZE)
            if block:
                yield block
            else:
                return

for block in read_in_block(file_path):
    print(block)
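Applied to the layout in the question (a 336-byte header followed by int32 x/y/z triplets), the same idea might look like the sketch below; the block size and the helper name are my own assumptions, not part of the original answer.

import numpy as np

CHANNELS = 3
HEADER_BYTES = 336
SAMPLES_PER_BLOCK = 1_000_000  # tune this to the available memory

def read_triplets_in_blocks(path, start_sample, end_sample):
    """Yield (n, 3) int32 blocks covering [start_sample, end_sample]."""
    with open(path, 'rb') as f:
        f.seek(HEADER_BYTES + start_sample * 4 * CHANNELS)
        remaining = end_sample - start_sample + 1
        while remaining > 0:
            n = min(SAMPLES_PER_BLOCK, remaining)
            block = np.fromfile(f, dtype=np.int32, count=n * CHANNELS)
            if block.size == 0:
                break  # reached end of file early
            yield block.reshape(-1, CHANNELS)
            remaining -= block.shape[0] // CHANNELS

Each yielded block can be processed and discarded before the next one is read, which keeps peak memory usage small.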

I don't use Numpy but I don't see anything obviously wrong with your code. However, you say the file is approximately 2.5 GB in size. A triplet index of 219,599,999 requires a file at least 2.45 GB in size:
$ calc
; 219599999 * 4 * 3
2635199988
; 2635199988 / 1024^3
~2.45422123745083808899
Are you sure your file is really that large?
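A quick way to check is to compare the actual size on disk with the size that the header and sample count imply. This is just a sketch; the path is a placeholder for your file.

import os

path = 'signal.bin'  # placeholder for your data file
expected = 336 + 223200000 * 3 * 4  # header + samples * channels * sizeof(int32)
actual = os.path.getsize(path)
# the last expression checks whether triplet 219,599,999 fits inside the file
print(actual, expected, actual >= 336 + (219599999 + 1) * 3 * 4)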
I also don't use MS Windows but the following toy programs work for me. The first creates a data file that mimics the structure of yours. The second shows that it can read the final data triplet. What happens if you run these on your system?
fh = open('x', 'wb')
fh.write(b'0123456789')
for i in range(0, 1000):
    s = bytes('{:03d}'.format(i), 'ascii')
    fh.write(b'a' + s + b'b' + s + b'c' + s)
fh.close()
Read the data from file x:
fh = open('x', 'rb')
triplet = 999
fh.seek(10 + triplet * 3 * 4)
data = fh.read(3 * 4)
print(data)

Related

struct.error: unpack requires a buffer of 2 bytes

I'm trying to identify the musical note of a sound from a .wav file using Python, but I'm getting the error above when using struct.
I couldn't gather much from the struct documentation or other websites on how to resolve this issue.
I have seen errors like:
struct.error: unpack requires a buffer of 4 bytes
struct.error: unpack requires a buffer of 1024 bytes
but the error seems to be for a different reason.
import numpy as np
import math
import wave
import os
import struct
import matplotlib.pyplot as plt

def note_detect(audio_file):
    #-------------------------------------------
    # here we are just storing our sound file as a numpy array
    # you can also use any other method to store the file as an np array
    file_length = audio_file.getnframes()
    f_s = audio_file.getframerate()  # sampling frequency
    sound = np.zeros(file_length)  # blank array
    for i in range(file_length):
        wdata = audio_file.readframes(1)
        data = struct.unpack("<h", wdata)
        sound[i] = int(data[0])
    plt.plot(sound)
    plt.show()
    sound = np.divide(sound, float(2**15))  # scaling it to 0 - 1
    counter = audio_file.getnchannels()  # number of channels mono/stereo
    #-------------------------------------------
    plt.plot(sound)
    plt.show()
    # fourier transformation from numpy module
    fourier = np.fft.fft(sound)
    fourier = np.absolute(fourier)
    imax = np.argmax(fourier[0:int(file_length/2)])  # index of max element
    plt.plot(fourier)
    plt.show()
    # peak detection
    i_begin = -1
    threshold = 0.3 * fourier[imax]
    for i in range(0, imax+100):
        if fourier[i] >= threshold:
            if i_begin == -1:
                i_begin = i
        if i_begin != -1 and fourier[i] < threshold:
            break
    i_end = i
    imax = np.argmax(fourier[0:i_end+100])
    freq = (imax * f_s) / (file_length * counter)  # formula to convert index into sound frequency
    # frequency database
    note = 0
    name = np.array(["C0","C#0","D0","D#0","E0","F0","F#0","G0","G#0","A0","A#0","B0","C1","C#1","D1","D#1","E1","F1","F#1","G1","G#1","A1","A#1","B1","C2","C#2","D2","D#2","E2","F2","F#2","G2","G2#","A2","A2#","B2","C3","C3#","D3","D3#","E3","F3","F3#","G3","G3#","A3","A3#","B3","C4","C4#","D4","D4#","E4","F4","F4#","G4","G4#","A4","A4#","B4","C5","C5#","D5","D5#","E5","F5","F5#","G5","G5#","A5","A5#","B5","C6","C6#","D6","D6#","E6","F6","F6#","G6","G6#","A6","A6#","B6","C7","C7#","D7","D7#","E7","F7","F7#","G7","G7#","A7","A7#","B7","C8","C8#","D8","D8#","E8","F8","F8#","G8","G8#","A8","A8#","B8","Beyond B8"])
    frequencies = np.array([16.35,17.32,18.35,19.45,20.60,21.83,23.12,24.50,25.96 ,27.50 ,29.14 ,30.87 ,32.70 ,34.65 ,36.71 ,38.89 ,41.20 ,43.65 ,46.25 ,49.00 ,51.91 ,55.00 ,58.27 ,61.74 ,65.41 ,69.30 ,73.42 ,77.78 ,82.41 ,87.31 ,92.50 ,98.00 ,103.83 ,110.00 ,116.54 ,123.47 ,130.81 ,138.59 ,146.83 ,155.56 ,164.81 ,174.61 ,185.00 ,196.00 ,207.65 ,220.00 ,233.08 ,246.94 ,261.63 ,277.18 ,293.66 ,311.13 ,329.63 ,349.23 ,369.99 ,392.00 ,415.30 ,440.00 ,466.16 ,493.88 ,523.25 ,554.37 ,587.33 ,622.25 ,659.26 ,698.46 ,739.99 ,783.99 ,830.61 ,880.00 ,932.33 ,987.77 ,1046.50 ,1108.73 ,1174.66 ,1244.51 ,1318.51 ,1396.91 ,1479.98 ,1567.98 ,1661.22 ,1760.00 ,1864.66 ,1975.53 ,2093.00 ,2217.46 ,2349.32 ,2489.02 ,2637.02 ,2793.83 ,2959.96 ,3135.96 ,3322.44 ,3520.00 ,3729.31 ,3951.07 ,4186.01 ,4434.92 ,4698.64 ,4978.03 ,5274.04 ,5587.65 ,5919.91 ,6271.93 ,6644.88 ,7040.00 ,7458.62 ,7902.13,8000])
    # searching for matched frequencies
    for i in range(0, frequencies.size-1):
        if freq < frequencies[0]:
            note = name[0]
            break
        if freq > frequencies[-1]:
            note = name[-1]
            break
        if freq >= frequencies[i] and frequencies[i+1] >= freq:
            if freq - frequencies[i] < (frequencies[i+1] - frequencies[i]) / 2:
                note = name[i]
            else:
                note = name[i+1]
            break
    return note

if __name__ == "__main__":
    path = os.getcwd()
    file_name = path + "\\" + "recording0.wav"
    audio_file = wave.open(file_name)
    Detected_Note = note_detect(audio_file)
    print("\n\tDetected Note = " + str(Detected_Note))
The full error on line 23:
Traceback (most recent call last):
  File "C:\Users\m8\Desktop\programing_stuff\python-stuff\minecraft_flute_player - 12-08-2022\app.py", line 86, in <module>
    Detected_Note = note_detect(audio_file)
  File "C:\Users\m8\Desktop\programing_stuff\python-stuff\minecraft_flute_player - 12-08-2022\app.py", line 23, in note_detect
    data=struct.unpack("<h",wdata)
struct.error: unpack requires a buffer of 2 bytes
Thanks for the help.
What I assume is happening here is that the size of a frame isn't 2 bytes as you expected.
By specifying <h you are saying that you are going to extract 2 bytes from each frame. See the struct documentation for more on that.
You can use the getparams function to better understand the structure of the wav file.
>>> audio_file.getparams()
_wave_params(nchannels=1, sampwidth=2, framerate=44100, nframes=22050, comptype='NONE', compname='not compressed')
The parameters which are interesting are nchannels and sampwidth.
You can calculate sampwidth * nchannels to understand the amount of bytes you need to extract from the frame for this WAV file.
In this example, you have sampwidth * nchannels = 1 * 2 = 2 bytes per frame.
More information can be found in this answer which shows different cases of frame sizes.
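As a rough sketch, you could also derive the unpack format from the file's parameters instead of hard-coding <h. The mapping from sample width to format character below is my own assumption (8-bit WAV samples are unsigned, 16-bit are signed).

import struct
import wave

audio_file = wave.open("recording0.wav")  # file name taken from the question
nchannels = audio_file.getnchannels()
sampwidth = audio_file.getsampwidth()

# map the sample width in bytes to a struct format character
fmt_char = {1: "B", 2: "h", 4: "i"}[sampwidth]
frame_fmt = "<" + fmt_char * nchannels  # one frame = one sample per channel

wdata = audio_file.readframes(1)
data = struct.unpack(frame_fmt, wdata)  # tuple with one value per channel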

unpack_from requires a buffer of at least 784 bytes

I'm running the following function for an ML model.
def get_images(filename):
    bin_file = open(filename, 'rb')
    buf = bin_file.read()  # the whole file is read into memory
    bin_file.close()  # release the file handle back to the operating system
    index = 0
    magic, num_images, num_rows, num_colums = struct.unpack_from(big_endian + four_bytes, buf, index)
    index += struct.calcsize(big_endian + four_bytes)
    images = []  # temp images as tuple
    for x in range(num_images):
        im = struct.unpack_from(big_endian + picture_bytes, buf, index)
        index += struct.calcsize(big_endian + picture_bytes)
        im = list(im)
        for i in range(len(im)):
            if im[i] > 1:
                im[i] = 1
However, I am receiving an error at the line:
im = struct.unpack_from(big_endian + picture_bytes, buf, index)
With the error:
error: unpack_from requires a buffer of at least 784 bytes
I have noticed this error only occurs at certain iterations. I cannot figure out why this might be the case. The dataset is a standard MNIST dataset which is freely available online.
I have also looked through similar questions on SO (e.g. error: unpack_from requires a buffer) but they don't seem to resolve the issue.
You didn't include the struct formats in your mre so it is hard to say why you are getting the error. Either you are using a partial/corrupted file or your struct formats are wrong.
This answer uses the test file 't10k-images-idx3-ubyte.gz' and file formats found at http://yann.lecun.com/exdb/mnist/
Open the file and read it into a bytes object (gzip is used because of the file's type).
import gzip, struct

with gzip.open(r'my\path\t10k-images-idx3-ubyte.gz', 'rb') as f:
    data = bytes(f.read())
print(len(data))
The file format spec says the header is 16 bytes (four 32 bit ints) - separate it from the pixels with a slice then unpack it
hdr,pixels = data[:16],data[16:]
magic, num_images, num_rows, num_cols = struct.unpack(">4L",hdr)
# print(len(hdr),len(pixels))
# print(magic, num_images, num_rows, num_cols)
There are a number of ways to iterate over the individual images.
img_size = num_rows * num_cols
imgfmt = "B" * img_size

for i in range(num_images):
    start = i * img_size
    end = start + img_size
    img = pixels[start:end]
    img = struct.unpack(imgfmt, img)
    # do work on the img
Or...
imgfmt = "B"*img_size
for img in struct.iter_unpack(imgfmt, pixels):
img = [p if p == 0 else 1 for p in img]
The itertools grouper recipe would probably also work.
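For completeness, here is a minimal sketch of that grouper-style variant, reusing pixels and img_size from the snippets above; the helper itself is the recipe from the itertools documentation.

from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks (itertools recipe)."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

for img in grouper(pixels, img_size, fillvalue=0):
    img = [0 if p == 0 else 1 for p in img]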

Appending numpy arrays into two binary files

I want to create two binary files to append numpy arrays into each one of them during a loop. I wrote the following method (I use Python 2.7):
import numpy as np

for _ in range(5):
    C = np.random.rand(1, 5)
    r = np.random.rand(1, 5)
    with open("C.bin", "ab") as file1, open("r.bin", "ab") as file2:
        # Append to binary files
        np.array(C).tofile(file1)
        np.array(r).tofile(file2)

# Now printing to check if appending is successful
C = np.load("C.bin")
r = np.load("r.bin")
print(C)
print(r)
However, I keep getting this error:
Traceback (most recent call last):
  File "test.py", line 15, in <module>
    C = np.load("C.bin")
  File "/anaconda/lib/python2.7/site-packages/numpy/lib/npyio.py", line 429, in load
    "Failed to interpret file %s as a pickle" % repr(file))
IOError: Failed to interpret file 'C.bin' as a pickle
I tried to fix it but I cannot see anything more. Any help is appreciated.
NOTE: I intentionally want to use np.load because later on I will be loading the dataset from the disk into a numpy array for further processing.
You should use the save method that is built into numpy to store the arrays in files. Here is what your code should look like:
for _ in range(5):
    C = np.random.rand(1, 5)
    r = np.random.rand(1, 5)
    np.save('C', C)
    np.save('r', r)
    # Now printing to check if appending is successful
    C = np.load("C.npy")
    r = np.load("r.npy")
    print(C)
    print(r)
    del C, r
Please refer to the documentation https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.load.html
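Note that np.save as used above overwrites C.npy and r.npy on every iteration rather than appending. If you really need several arrays appended to one file and read back with np.load, one possible approach, sketched here and not part of the original answer, is to pass an open file handle: np.save then writes consecutive .npy records and np.load reads them back one at a time.

import numpy as np

# write: append each array as a separate .npy record in one file
with open("C.bin", "wb") as file1:
    for _ in range(5):
        C = np.random.rand(1, 5)
        np.save(file1, C)

# read: call np.load repeatedly on the same handle to get the arrays back
arrays = []
with open("C.bin", "rb") as file1:
    for _ in range(5):
        arrays.append(np.load(file1))
print(arrays)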

How do I remove the memory limit on openmpi processes?

I'm running a process with mpirun and 2 cores and it gets killed at the point when I'm mixing values between the two processes. Both processes use about 15% of the machines memory and even though the memory will increase when mixing, there should still be plenty of memory left. So I'm assuming that there is a limit on the amount of memory used for passing messages in between the processes. How do I find out what this limit is and how do I remove it?
The error message that I'm getting when mpirun dies is this:
File "Comm.pyx", line 864, in mpi4py.MPI.Comm.bcast (src/mpi4py.MPI.c:67787)
File "pickled.pxi", line 564, in mpi4py.MPI.PyMPI_bcast (src/mpi4py.MPI.c:31462)
File "pickled.pxi", line 93, in mpi4py.MPI._p_Pickle.alloc (src/mpi4py.MPI.c:26327)
SystemError: Negative size passed to PyBytes_FromStringAndSize
And this is the bit of the code that leads to the error:
sum_updates_j_k = numpy.zeros((self.col.J_total, self.K), dtype=numpy.float64)
comm.Reduce(self.updates_j_k, sum_updates_j_k, op=MPI.SUM)
sum_updates_j_k = comm.bcast(sum_updates_j_k, root=0)
The code usually works; it only runs into problems with larger amounts of data, which increase the size of the matrix I'm exchanging between processes.
The culprit is probably the following lines found in the code of PyMPI_bcast():
cdef int count = 0
...
if dosend: smsg = pickle.dump(obj, &buf, &count) # <----- (1)
with nogil: CHKERR( MPI_Bcast(&count, 1, MPI_INT, # <----- (2)
root, comm) )
cdef object rmsg = None
if dorecv and dosend: rmsg = smsg
elif dorecv: rmsg = pickle.alloc(&buf, count)
...
What happens here is that the object is first serialised at (1) using pickle.dump() and then the length of the pickled stream is broadcasted at (2).
There are two problems here and they both have to do with the fact that int is used for the length. The first problem is an integer cast inside pickle.dump and the other problem is that MPI_INT is used to transmit the length of the pickled stream. This limits the amount of data in your matrix to a certain size - namely the size that would result in a pickled object no bigger than 2 GiB (2^31 - 1 bytes). Any bigger object would result in an integer overflow and thus negative values in count.
This is clearly not an MPI issue but rather a bug in (or a feature of?) mpi4py.
I had the same problem with mpi4py recently. As pointed out by Hristo Iliev in his answer, it's a pickle problem.
This can be avoided by using the upper-case methods comm.Reduce(), comm.Bcast(), etc., which do not resort to pickle, as opposed to lower-case methods like comm.reduce(). As a bonus, upper case methods should be a bit faster as well.
Actually, you're already using comm.Reduce(), so I expect that switching to comm.Bcast() should solve your problem - it did for me.
NB: The syntax of upper-case methods is slightly different, but this tutorial can help you get started.
For example, instead of:
sum_updates_j_k = comm.bcast(sum_updates_j_k, root=0)
you would use:
comm.Bcast(sum_updates_j_k, root=0)
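Keep in mind that the upper-case Bcast fills an existing buffer in place, so every rank needs an array of the right shape allocated before the call. A minimal sketch, with placeholder shapes standing in for self.col.J_total and self.K from the question:

from mpi4py import MPI
import numpy

comm = MPI.COMM_WORLD

# placeholder shapes; in the question these come from self.col.J_total and self.K
J_total, K = 4, 3
updates_j_k = numpy.random.rand(J_total, K)

# every rank must allocate the receive buffer before the collective call
sum_updates_j_k = numpy.zeros((J_total, K), dtype=numpy.float64)
comm.Reduce(updates_j_k, sum_updates_j_k, op=MPI.SUM, root=0)

# Bcast sends the buffer's raw bytes, so no pickling (and no pickle size limit) is involved
comm.Bcast(sum_updates_j_k, root=0)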
For such a case it is useful to have a function that can send numpy arrays in parts, e.g.:
from mpi4py import MPI
import sys, math, numpy

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

def bcast_array_obj(obj = None, dtype = numpy.float64, root = 0):
    """Function for broadcasting of a numpy array object"""
    reporter = 0 if root > 0 else 1
    if rank == root:
        for exp in range(11):
            parts = pow(2, exp)
            err = False
            part_len = math.ceil(len(obj) / parts)
            for part in range(parts):
                part_begin = part * part_len
                part_end = min((part + 1) * part_len, len(obj))
                try:
                    comm.bcast(obj[part_begin: part_end], root = root)
                except:
                    err = True
                err *= comm.recv(source = reporter, tag = 2)
                if err:
                    break
            if err:
                continue
            comm.bcast(None, root = root)
            print('The array was successfully sent in {} part{}'.\
                  format(parts, 's' if parts > 1 else ''))
            return
        sys.stderr.write('Failed to send the array even in 1024 parts')
        sys.stderr.flush()
    else:
        obj = numpy.zeros(0, dtype = dtype)
        while True:
            err = False
            try:
                part_obj = comm.bcast(root = root)
            except:
                err = True
                obj = numpy.zeros(0, dtype = dtype)
            if rank == reporter:
                comm.send(err, dest = root, tag = 2)
            if err:
                continue
            if type(part_obj) != type(None):
                frags = len(obj)
                obj.resize(frags + len(part_obj))
                obj[frags: ] = part_obj
            else:
                break
        return obj
This function automatically determines the optimal number of parts into which to break the input array.
For example,
if rank != 0:
    z = bcast_array_obj(root = 0)
else:
    z = numpy.zeros(1000000000, dtype = numpy.float64)
    bcast_array_obj(z, root = 0)
outputs
The array was successfully sent in 4 parts
Apparently this is an issue in MPI itself and not in MPI4py. The actual variable which holds the size of the data being communicated is a signed 32 bit integer which will overflow to a negative value for around 2GB of data.
Maximum amount of data that can be sent using MPI::Send
It's been raised as an issue with MPI4py previously as well here.

Creating random binary files

I'm trying to use python to create a random binary file. This is what I've got already:
f = open(filename, 'wb')
for i in xrange(size_kb):
    for ii in xrange(1024/4):
        f.write(struct.pack("=I", random.randint(0, sys.maxint*2+1)))
f.close()
But it's terribly slow (0.82 seconds for size_kb=1024 on my 3.9GHz SSD disk machine). A big bottleneck seems to be the random int generation (replacing the randint() with a 0 reduces running time from 0.82s to 0.14s).
Now I know there are more efficient ways of creating random data files (namely dd if=/dev/urandom) but I'm trying to figure this out for sake of curiosity... is there an obvious way to improve this?
IMHO - the following is completely redundant:
f.write(struct.pack("=I",random.randint(0,sys.maxint*2+1)))
There's absolutely no need to use struct.pack, just do something like:
import os
fileSizeInBytes = 1024
with open('output_filename', 'wb') as fout:
    fout.write(os.urandom(fileSizeInBytes))  # replace 1024 with a size in kilobytes if it is not unreasonably large
Then, if you need to read the file back as integers, use struct.unpack.
(my use case is generating a file for a unit test so I just need a file that isn't identical with other generated files).
Another option is to just write a UUID4 to the file, but since I don't know the exact use case, I'm not sure that's viable.
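For that read-back case, a small sketch; the file name matches the snippet above and the "=I" format mirrors the question's struct.pack call:

import struct

with open('output_filename', 'rb') as fin:
    raw = fin.read()

# interpret the random bytes as native unsigned 32-bit integers
count = len(raw) // 4
values = struct.unpack("=%dI" % count, raw[:count * 4])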
The Python code you should write completely depends on the way you intend to use the random binary file. If you just need "rather good" randomness for general purposes, then the code of Jon Clements is probably the best.
However, on Linux OS at least, os.urandom relies on /dev/urandom, which is described in the Linux Kernel (drivers/char/random.c) as follows:
The /dev/urandom device [...] will return as many bytes as are
requested. As more and more random bytes are requested without giving
time for the entropy pool to recharge, this will result in random
numbers that are merely cryptographically strong. For many
applications, however, this is acceptable.
So the question is, is this acceptable for your application? If you prefer a more secure RNG, you could read bytes from /dev/random instead. The main inconvenience of this device is that it can block indefinitely if the Linux kernel is not able to gather enough entropy. There are also other cryptographically secure RNGs like EGD.
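A minimal sketch of that alternative (Linux only, and the read may block while the kernel gathers entropy):

# read n random bytes directly from the blocking kernel RNG (Linux only)
n = 1024
with open("/dev/random", "rb") as rng:
    data = rng.read(n)

with open("output_filename", "wb") as fout:
    fout.write(data)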
Alternatively, if your main concern is execution speed and you just need some "light" randomness for a Monte-Carlo method (i.e. unpredictability doesn't matter, uniform distribution does), you could consider generating your random binary file once and using it many times, at least during development.
Here's a complete script, based on the accepted answer, that creates random files.
import sys, os

def help(error: str = None) -> None:
    if error and error != "help":
        print("***", error, "\n\n", file=sys.stderr, sep=' ', end='')
        sys.exit(1)
    print("""\tCreates binary files with random content""", end='\n')
    print("""Usage:""")
    print(os.path.split(__file__)[1], """ "name1" "1TB" "name2" "5kb"
    Accepted units: MB, GB, KB, TB, B""")
    sys.exit(2)

# https://stackoverflow.com/a/51253225/1077444
def convert_size_to_bytes(size_str):
    """Convert human filesizes to bytes.

    ex: 1 tb, 1 kb, 1 mb, 1 pb, 1 eb, 1 zb, 3 yb

    To reverse this, see hurry.filesize or the Django filesizeformat template
    filter.

    :param size_str: A human-readable string representing a file size, e.g.,
        "22 megabytes".
    :return: The number of bytes represented by the string.
    """
    multipliers = {
        'kilobyte': 1024,
        'megabyte': 1024 ** 2,
        'gigabyte': 1024 ** 3,
        'terabyte': 1024 ** 4,
        'petabyte': 1024 ** 5,
        'exabyte': 1024 ** 6,
        'zetabyte': 1024 ** 7,
        'yottabyte': 1024 ** 8,
        'kb': 1024,
        'mb': 1024**2,
        'gb': 1024**3,
        'tb': 1024**4,
        'pb': 1024**5,
        'eb': 1024**6,
        'zb': 1024**7,
        'yb': 1024**8,
    }
    for suffix in multipliers:
        size_str = size_str.lower().strip().strip('s')
        if size_str.lower().endswith(suffix):
            return int(float(size_str[0:-len(suffix)]) * multipliers[suffix])
    else:
        if size_str.endswith('b'):
            size_str = size_str[0:-1]
        elif size_str.endswith('byte'):
            size_str = size_str[0:-4]
    return int(size_str)

if __name__ == "__main__":
    input = {}  # { file: byte_size }
    if (len(sys.argv) - 1) % 2 != 0:
        print("-- Provide even number of arguments --")
        print(f'--\tGot: {len(sys.argv)-1}: "' + r'" "'.join(sys.argv[1:]) + '"')
        sys.exit(2)
    elif len(sys.argv) == 1:
        help()

    try:
        for file, size_str in zip(sys.argv[1::2], sys.argv[2::2]):
            input[file] = convert_size_to_bytes(size_str)
    except ValueError as ex:
        print(f'Invalid size: "{size_str}"', file=sys.stderr)
        sys.exit(1)

    for file, size_bytes in input.items():
        print(f"Writing: {file}")
        # https://stackoverflow.com/a/14276423/1077444
        with open(file, 'wb') as fout:
            while size_bytes > 0:
                wrote = min(size_bytes, 1024)  # chunk
                fout.write(os.urandom(wrote))
                size_bytes -= wrote
