Python binary data reading

Python binary data reading - python

A urllib2 request receives binary response as below:
00 00 00 01 00 04 41 4D 54 44 00 00 00 00 02 41
97 33 33 41 99 5C 29 41 90 3D 71 41 91 D7 0A 47
0F C6 14 00 00 01 16 6A E0 68 80 41 93 B4 05 41
97 1E B8 41 90 7A E1 41 96 8F 57 46 E6 2E 80 00
00 01 16 7A 53 7C 80 FF FF
Its structure is:
DATA, TYPE, DESCRIPTION
00 00 00 01, 4 bytes, Symbol Count =1
00 04, 2 bytes, Symbol Length = 4
41 4D 54 44, 6 bytes, Symbol = AMTD
00, 1 byte, Error code = 0 (OK)
00 00 00 02, 4 bytes, Bar Count = 2
FIRST BAR
41 97 33 33, 4 bytes, Close = 18.90
41 99 5C 29, 4 bytes, High = 19.17
41 90 3D 71, 4 bytes, Low = 18.03
41 91 D7 0A, 4 bytes, Open = 18.23
47 0F C6 14, 4 bytes, Volume = 3,680,608
00 00 01 16 6A E0 68 80, 8 bytes, Timestamp = November 23,2007
SECOND BAR
41 93 B4 05, 4 bytes, Close = 18.4629
41 97 1E B8, 4 bytes, High = 18.89
41 90 7A E1, 4 bytes, Low = 18.06
41 96 8F 57, 4 bytes, Open = 18.82
46 E6 2E 80, 4 bytes, Volume = 2,946,325
00 00 01 16 7A 53 7C 80, 8 bytes, Timestamp = November 26,2007
TERMINATOR
FF FF, 2 bytes,
How to read binary data like this?
Thanks in advance.
Update:
I tried struct module on first 6 bytes with following code:
struct.unpack('ih', response.read(6))
(16777216, 1024)
But it should output (1, 4). I take a look at the manual but have no clue what was wrong.

So here's my best shot at interpreting the data you're giving...:
import datetime
import struct
class Printable(object):
specials = ()
def __str__(self):
resultlines = []
for pair in self.__dict__.items():
if pair[0] in self.specials: continue
resultlines.append('%10s %s' % pair)
return '\n'.join(resultlines)
head_fmt = '>IH6sBH'
head_struct = struct.Struct(head_fmt)
class Header(Printable):
specials = ('bars',)
def __init__(self, symbol_count, symbol_length,
symbol, error_code, bar_count):
self.__dict__.update(locals())
self.bars = []
del self.self
bar_fmt = '>5fQ'
bar_struct = struct.Struct(bar_fmt)
class Bar(Printable):
specials = ('header',)
def __init__(self, header, close, high, low,
open, volume, timestamp):
self.__dict__.update(locals())
self.header.bars.append(self)
del self.self
self.timestamp /= 1000.0
self.timestamp = datetime.date.fromtimestamp(self.timestamp)
def showdata(data):
terminator = '\xff' * 2
assert data[-2:] == terminator
head_data = head_struct.unpack(data[:head_struct.size])
try:
assert head_data[4] * bar_struct.size + head_struct.size == \
len(data) - len(terminator)
except AssertionError:
print 'data length is %d' % len(data)
print 'head struct size is %d' % head_struct.size
print 'bar struct size is %d' % bar_struct.size
print 'number of bars is %d' % head_data[4]
print 'head data:', head_data
print 'terminator:', terminator
print 'so, something is wrong, since',
print head_data[4] * bar_struct.size + head_struct.size, '!=',
print len(data) - len(terminator)
raise
head = Header(*head_data)
for i in range(head.bar_count):
bar_substr = data[head_struct.size + i * bar_struct.size:
head_struct.size + (i+1) * bar_struct.size]
bar_data = bar_struct.unpack(bar_substr)
Bar(head, *bar_data)
assert len(head.bars) == head.bar_count
print head
for i, x in enumerate(head.bars):
print 'Bar #%s' % i
print x
datas = '''
00 00 00 01 00 04 41 4D 54 44 00 00 00 00 02 41
97 33 33 41 99 5C 29 41 90 3D 71 41 91 D7 0A 47
0F C6 14 00 00 01 16 6A E0 68 80 41 93 B4 05 41
97 1E B8 41 90 7A E1 41 96 8F 57 46 E6 2E 80 00
00 01 16 7A 53 7C 80 FF FF
'''
data = ''.join(chr(int(x, 16)) for x in datas.split())
showdata(data)
this emits:
symbol_count 1
bar_count 2
symbol AMTD
error_code 0
symbol_length 4
Bar #0
volume 36806.078125
timestamp 2007-11-22
high 19.1700000763
low 18.0300006866
close 18.8999996185
open 18.2299995422
Bar #1
volume 29463.25
timestamp 2007-11-25
high 18.8899993896
low 18.0599994659
close 18.4629001617
open 18.8199901581
...which seems to be pretty close to what you want, net of some output formatting details. Hope this helps!-)

>>> data
'\x00\x00\x00\x01\x00\x04AMTD\x00\x00\x00\x00\x02A\x9733A\x99\\)A\x90=qA\x91\xd7\nG\x0f\xc6\x14\x00\x00\x01\x16j\xe0h\x80A\x93\xb4\x05A\x97\x1e\xb8A\x90z\xe1A\x96\x8fWF\xe6.\x80\x00\x00\x01\x16zS|\x80\xff\xff'
>>> from struct import unpack, calcsize
>>> scount, slength = unpack("!IH", data[:6])
>>> assert scount == 1
>>> symbol, error_code = unpack("!%dsb" % slength, data[6:6+slength+1])
>>> assert error_code == 0
>>> symbol
'AMTD'
>>> bar_count = unpack("!I", data[6+slength+1:6+slength+1+4])
>>> bar_count
(2,)
>>> bar_format = "!5fQ"
>>> from collections import namedtuple
>>> Bar = namedtuple("Bar", "Close High Low Open Volume Timestamp")
>>> b = Bar(*unpack(bar_format, data[6+slength+1+4:6+slength+1+4+calcsize(bar_format)]))
>>> b
Bar(Close=18.899999618530273, High=19.170000076293945, Low=18.030000686645508, Open=18.229999542236328, Volume=36806.078125, Timestamp=1195794000000L)
>>> import time
>>> time.ctime(b.Timestamp//1000)
'Fri Nov 23 08:00:00 2007'
>>> int(b.Volume*100 + 0.5)
3680608

>>> struct.unpack('ih', response.read(6))
(16777216, 1024)
You are unpacking big-endian data on a little-endian machine. Try this instead:
>>> struct.unpack('!IH', response.read(6))
(1L, 4)
This tells unpack to consider the data in network-order (big-endian). Also, the values of counts and lengths can not be negative, so you should should use the unsigned variants in your format string.

Take a look at the struct.unpack in the struct module.

Use pack/unpack functions from "struct" package. More info here http://docs.python.org/library/struct.html
Bye!

As it was already mentioned, struct is the module you need to use.
Please read its documentation to learn about byte ordering, etc.
In your example you need to do the following (as your data is big-endian and unsigned):
>>> import struct
>>> x = '\x00\x00\x00\x01\x00\x04'
>>> struct.unpack('>IH', x)
(1, 4)

Related

ASCII decoding using Python

I am using Python 2.7 (due to toolchains dependency :-( )
Input String = 'F1 88 52 45 4D 41 2D 33 43 37 38 32 2D 42 42 00 00 00 00 00 00 00 00 00'
Output String = REMA-3C782-BB
Please help me in decoding the ASCII string(in hex) to the Output String as above.
This below code throws error:
BResponse = 'F1 88 52 45 4D 41 2D 33 43 37 38 32 2D 42 42 00 00 00 00 00 00 00 00 00'
BResponse = BResponse.decode('ASCII')

It looks like you're discarding non-ascii characters, you can use string.printable in order to look for characters you don't want in your output.
import string
def parse_response(response):
return ''.join([chr(int(hex_letter, 16))
if chr(int(hex_letter, 16)) in string.printable
else ''
for hex_letter in response.split(' ')])
Making it return what we intended:
BResponse = 'F1 88 52 45 4D 41 2D 33 43 37 38 32 2D 42 42 00 00 00 00 00 00 00 00 00'
print parse_response(BResponse)
REMA-3C782-BB

I would do it following way
cipher = '52 45 4D 41' # this should give REMA
plain = ''.join(['%c' % int(i,16) for i in cipher.split()])
print plain
gives output
REMA
(tested in Python 2.7.18)

How to write Midi File from scratch using python

I'm studying the Midi file specification, right now I'm testing this, which works fine if played by Timidity but it's corrupted for either Garage Band, OS X(The output doesn't play) and Synthesia.
head = '4d 54 68 64'
chunklen = '00 00 00 06'
mformat = '00 01'
ntracks = '00 02'
tickdiv = '00 60'
trackid = '4d 54 72 6b'
eot = '00 ff 2f 00'
makeheader = lambda : " ".join([head,chunklen,mformat,ntracks,tickdiv])
def chunklencalc(notes):
chlen = format(len(notes)*4, 'x')
return " ".join([x for x in re.compile('(.{2})').split("00000000"[len(chlen):] + chlen) if x != ''])
maketrack = lambda notes : " ".join([trackid, chunklencalc(notes)] + notes + [eot])
makestandardquarter = lambda root : f"00 90 {root} 64 60 80 {root} 64"
def createMidi(filename,bytelist):
with open(filename, 'wb') as f:
for e in bytelist.split(" "):
f.write(bytes.fromhex(e))
filename = 'firsttest.mid'
head = makeheader()
notes1 =[
makestandardquarter('3c'),
makestandardquarter('3c'),
makestandardquarter('3c'),
makestandardquarter('3c'),
makestandardquarter('3c'),
makestandardquarter('3c'),
makestandardquarter('3c'),
makestandardquarter('3c'),
]
notes2 =[
makestandardquarter('40'),
makestandardquarter('40'),
makestandardquarter('40'),
makestandardquarter('40'),
makestandardquarter('40'),
makestandardquarter('40'),
makestandardquarter('40'),
makestandardquarter('40'),
]
track1 = maketrack(notes1)
track2 = maketrack(notes2)
createMidi(filename, " ".join([head, track1,track2]))
I Expected a series of quarters in two tracks, got only the first four on only one track.

After taking a deep look with hexdump, and looking at the defined chunk lengths:
first chunk declares to be 0x20(32) bytes long, starting at position 0x17 (23) and ends at 0x5b (91) that means that your chunk lenght calculations are off by 34 bytes.
00000000 4d 54 68 64 00 00 00 06 00 01 00 02 00 60 4d 54 |MThd.........`MT|
00000010 72 6b 00 00 00 20 00 90 3c 64 60 80 3c 64 00 90 |rk... ..<d`.<d..|
00000020 3c 64 60 80 3c 64 00 90 3c 64 60 80 3c 64 00 90 |<d`.<d..<d`.<d..|
*
00000050 3c 64 60 80 3c 64 00 ff 2f 00 4d 54 72 6b 00 00 |<d`.<d../.MTrk..|
00000060 00 20 00 90 40 64 60 80 40 64 00 90 40 64 60 80 |. ..#d`.#d..#d`.|
00000070 40 64 00 90 40 64 60 80 40 64 00 90 40 64 60 80 |#d..#d`.#d..#d`.|
*
000000a0 40 64 00 ff 2f 00 |#d../.|
000000a6
I wrote my own version using struct:
import struct
HEAD_ID = b"\x4d\x54\x68\x64"
TRACK_ID = b"\x4d\x54\x72\x6b"
class HeaderChunk:
def __init__(self, format, ntrack, tickdiv):
self.format = format
self.ntrack = ntrack
self.tickdiv = tickdiv
def dump(self):
payload = struct.pack(">HH2s", self.format, self.ntrack, self.tickdiv)
header = HEAD_ID + struct.pack(">I", len(payload))
return header + payload
class TrackChunk:
"""Represents a track"""
def __init__(self):
self.data = b""
def quarter(self, note):
self.data += b"\x00\x90" + note + b"\x64\x60\x80" + note + b"\x64"
def dump(self):
header = TRACK_ID + struct.pack(">I", len(self.data))
return header + self.data
header = HeaderChunk(1, 2, b"\x00\x60")
first_track = TrackChunk()
for _ in range(8):
first_track.quarter(b"\x3c")
second_track = TrackChunk()
for _ in range(8):
second_track.quarter(b"\x40")
with open("joac-example.mid", "wb") as output:
output.write(header.dump())
output.write(first_track.dump())
output.write(second_track.dump())
It is correctly loaded on garage band

Python: convert hex bytestream to “int16"

So I'm working with incoming audio from Watson Text to Speech. I want to play the sound immediately when data arrives to Python with a websocket from nodeJS.
This is a example of data I'm sending with the websocket:
<Buffer e3 f8 28 f9 fa f9 5d fb 6c fc a6 fd 12 ff b3 00 b8 02 93 04 42 06 5b 07 e4 07 af 08 18 0a 95 0b 01 0d a2 0e a4 10 d7 12 f4 12 84 12 39 13 b0 12 3b 13 ... >
So the data arrives as a hex bytestream and I try to convert it to something that Sounddevice can read/play. (See documentation: The types 'float32', 'int32', 'int16', 'int8' and 'uint8' can be used for all streams and functions.) But how can I convert this?
I already tried something, but when I run my code I only hear some noise, nothing recognizable.
Here you can read some parts of my code:
def onMessage(self, payload, isBinary):
a = payload.encode('hex')
queue.put(a)
After I receive the bytesstream and convert to hex, I try to send the incoming bytestream to Sounddevice:
def stream_audio():
with sd.OutputStream(channels=1, samplerate=24000, dtype='int16', callback=callback):
sd.sleep(int(20 * 1000))
def callback(outdata, frames, time, status):
global reststuff, i, string
LENGTH = frames
while len(reststuff) < LENGTH:
a = queue.get()
reststuff += a
returnstring = reststuff[:LENGTH]
reststuff = reststuff[LENGTH:]
for char in returnstring:
i += 1
string += char
if i % 2 == 0:
print string
outdata[:] = int(string, 16)
string = ""

look at your stream of data:
e3 f8 28 f9 fa f9 5d fb 6c fc a6 fd 12 ff b3 00
b8 02 93 04 42 06 5b 07 e4 07 af 08 18 0a 95 0b
01 0d a2 0e a4 10 d7 12 f4 12 84 12 39 13 b0 12
3b 13
you see here that every two bytes the second one is starting with e/f/0/1 which means near zero (in two's complement).
So that's your most significant bytes, so your stream is little-endian!
you should consider that in your conversion.
If I have more data I would have tested but this is worth some miliseconds!

Write null bytes in a file instead of correct strings

I have a python script that process a data file :
out = open('result/process/'+name+'.res','w')
out.write("source,rssi,lqi,packetId,run,counter\n")
f = open('result/resultat0.res','r')
for ligne in [x for x in f if x != '']:
chaine = ligne.rstrip('\n')
tmp = chaine.split(',')
if (len(tmp) == 6 ):
out.write(','.join(tmp)+"\n")
f.close()
The complete code is here
I use this script on several computers and the behavior is not the same.
On the first computer, with python 2.6.6, the result is what I expect.
However, on the others (python 2.6.6, 3.3.2, 2.7.5) the write method of file object puts null bytes instead of the values I want during the most part of the processing. I get this result :
$ hexdump -C result/process/1.res
00000000 73 6f 75 72 63 65 2c 72 73 73 69 2c 6c 71 69 2c |source,rssi,lqi,|
00000010 70 61 63 6b 65 74 49 64 2c 72 75 6e 2c 63 6f 75 |packetId,run,cou|
00000020 6e 74 65 72 0a 00 00 00 00 00 00 00 00 00 00 00 |nter............|
00000030 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
0003a130 00 00 00 00 00 00 00 00 00 00 31 33 2c 36 35 2c |..........13,65,|
0003a140 31 34 2c 38 2c 39 38 2c 31 33 31 34 32 0a 31 32 |14,8,98,13142.12|
0003a150 2c 34 37 2c 31 37 2c 38 2c 39 38 2c 31 33 31 34 |,47,17,8,98,1314|
0003a160 33 0a 33 2c 34 35 2c 31 38 2c 38 2c 39 38 2c 31 |3.3,45,18,8,98,1|
0003a170 33 31 34 34 0a 31 31 2c 38 2c 32 33 2c 38 2c 39 |3144.11,8,23,8,9|
0003a180 38 2c 31 33 31 34 35 0a 39 2c 32 30 2c 32 32 2c |8,13145.9,20,22,|
Have you an idea how to resolve this problem please ?

With the following considerations:
In over a decade of programming python, I've never come across a compelling reason to use global. Pass arguments to functions instead.
For ensuring files are closed when finished with, use the with statement.
Here's an (untested) attempt at refactoring your code for sanity, assumes that you have enough memory available to hold all of the lines under a particular identifier.
If you have null bytes in your result files after this refactoring then we have reasonable basis to proceed with debugging.
import os
import re
from contextlib import closing
def list_files_to_process(directory='results'):
"""
Return a list of files from directory where the file extension is '.res',
case insensitive.
"""
results = []
for filename in os.listdir(directory):
filepath = os.path.join(directory,filename)
if os.path.isfile(filepath) and filename.lower().endswith('.res'):
results.append(filepath)
return results
def group_lines(sequence):
"""
Generator, process a sequence of lines, separated by a particular line.
Yields batches of lines along with the id from the separator.
"""
separator = re.compile('^A:(?P<id>\d+):$')
batch = []
batch_id = None
for line in sequence:
if not line: # Ignore blanks
continue
m = separator.match(line):
if m is not None:
if batch_id is not None or len(batch) > 0:
yield (batch_id,batch)
batch_id = m.group('id')
batch = []
else:
batch.append(line)
if batch_id is not None or len(batch) > 0:
yield (batch_id,batch)
def filename_for_results(batch_id,result_directory):
"""
Return an appropriate filename for a batch_id under the result directory
"""
return os.path.join(result_directory,"results-%s.res" % (batch_id,))
def open_result_file(filename,header="source,rssi,lqi,packetId,run,counter"):
"""
Return an open file object in append mode, having appended a header if
filename doesn't exist or is empty
"""
if os.path.exists(filename) and os.path.getsize(filename) > 0:
# No need to write header
return open(filename,'a')
else:
f = open(filename,'a')
f.write(header + '\n')
return f
def process_file(filename,result_directory='results/processed'):
"""
Open filename and process it's contents. Uses group_lines() to group
lines into different files based upon specific line acting as a
content separator.
"""
error_filename = filename_for_results('error',result_directory)
with open(filename,'r') as in_file, open(error_filename,'w') as error_out:
for batch_id, lines in group_lines(in_file):
if len(lines) == 0:
error_out.write("Received batch %r with 0 lines" % (batch_id,))
continue
out_filename = filename_for_results(batch_id,result_directory)
with closing(open_result_file(out_filename)) as out_file:
for line in lines:
if line.startswith('L') and line.endswith('E') and line.count(',') == 5:
line = line.lstrip('L').rstrip('E')
out_file.write(line + '\n')
else:
error_out.write("Unknown line, batch=%r: %r\n" %(batch_id,line))
if __name__ == '__main__':
files = list_files_to_process()
for filename in files:
print "Processing %s" % (filename,)
process_file(filename)

Parse WAV file header

I am writing a program to parse a WAV file header and print the information to the screen. Before writing the program i am doing some research
hexdump -n 48 sound_file_8000hz.wav
00000000 52 49 46 46 bc af 01 00 57 41 56 45 66 6d 74 20 |RIFF....WAVEfmt |
00000010 10 00 00 00 01 00 01 00 >40 1f 00 00< 40 1f 00 00 |........#...#...|
00000020 01 00 08 00 64 61 74 61 98 af 01 00 81 80 81 80 |....data........|
hexdump -n 48 sound_file_44100hz.wav
00000000 52 49 46 46 c4 ea 1a 00 57 41 56 45 66 6d 74 20 |RIFF....WAVEfmt |
00000010 10 00 00 00 01 00 02 00 >44 ac 00 00< 10 b1 02 00 |........D.......|
00000020 04 00 10 00 64 61 74 61 a0 ea 1a 00 00 00 00 00 |....data........|
The part between > and < in both files are the sample rate.
How does "40 1f 00 00" translate to 8000Hz and "44 ac 00 00" to 44100Hz? Information like number of channels and audio format can be read directly from the dump. I found a Python
script called WavHeader that parses the sample rate correctly in both files. This is the core of the script:
bufHeader = fileIn.read(38)
# Verify that the correct identifiers are present
if (bufHeader[0:4] != "RIFF") or \
(bufHeader[12:16] != "fmt "):
logging.debug("Input file not a standard WAV file")
return
# endif
stHeaderFields = {'ChunkSize' : 0, 'Format' : '',
'Subchunk1Size' : 0, 'AudioFormat' : 0,
'NumChannels' : 0, 'SampleRate' : 0,
'ByteRate' : 0, 'BlockAlign' : 0,
'BitsPerSample' : 0, 'Filename': ''}
# Parse fields
stHeaderFields['ChunkSize'] = struct.unpack('<L', bufHeader[4:8])[0]
stHeaderFields['Format'] = bufHeader[8:12]
stHeaderFields['Subchunk1Size'] = struct.unpack('<L', bufHeader[16:20])[0]
stHeaderFields['AudioFormat'] = struct.unpack('<H', bufHeader[20:22])[0]
stHeaderFields['NumChannels'] = struct.unpack('<H', bufHeader[22:24])[0]
stHeaderFields['SampleRate'] = struct.unpack('<L', bufHeader[24:28])[0]
stHeaderFields['ByteRate'] = struct.unpack('<L', bufHeader[28:32])[0]
stHeaderFields['BlockAlign'] = struct.unpack('<H', bufHeader[32:34])[0]
stHeaderFields['BitsPerSample'] = struct.unpack('<H', bufHeader[34:36])[0]
I do not understand how this can extract the corret sample rates, when i cannot using hexdump?
I am using information about the WAV file format from this page:
https://ccrma.stanford.edu/courses/422/projects/WaveFormat/

The "40 1F 00 00" bytes equate to an integer whose hexadecimal value is 00001F40 (remember that the integers are stored in a WAVE file in the little endian format). A value of 00001F40 in hexadecimal equates to a decimal value of 8000.
Similarly, the "44 AC 00 00" bytes equate to an integer whose hexadecimal value is 0000AC44. A value of 0000AC44 in hexadecimal equates to a decimal value of 44100.

They're little-endian.
>>> 0x00001f40
8000
>>> 0x0000ac44
44100

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python binary data reading - python

Take a look at the struct.unpack in the struct module.

Use pack/unpack functions from "struct" package. More info here http://docs.python.org/library/struct.html Bye!

Related

ASCII decoding using Python

How to write Midi File from scratch using python

Python: convert hex bytestream to “int16"

Write null bytes in a file instead of correct strings

Parse WAV file header

Categories

Resources