Read data from binary file python - python

I have a binary file with this format:
and I use this code to open it:
import numpy as np

# Fixed-width description of one record. The 'S<n>' sizes are hard-coded to
# the string lengths of the first record, so this dtype can only decode
# records whose strings happen to have exactly these lengths.
# NOTE(review): `int` maps to a platform-dependent integer width in numpy;
# confirm it matches the integer width used in the file.
dt = np.dtype({
    'names': [
        'au_id', 'len_au_name', 'au_name', 'nu_of_publ',
        'pub_id', 'len_of_pub_id', 'pub_title', 'num_auth',
        'len_au_name_1', 'au_name1',
        'len_au_name_2', 'au_name2',
        'len_au_name_3', 'au_name3',
        'year_publ', 'num_of_cit',
        'citid', 'len_cit_tit', 'cit_tit', 'num_of_au_cit',
        'len_cit_au_name_1', 'au_cit_name_1',
        'len_cit_au_name_2', 'au_cit_name_2',
        'len_cit_au_name_3', 'au_cit_name_3',
        'len_cit_au_name_4', 'au_cit_name_4',
        'len_cit_au_name_5', 'au_cit_name_5',
        'year_cit',
    ],
    'formats': [
        int, int, 'S13', int,
        int, int, 'S61', int,
        int, 'S8',
        int, 'S7',
        int, 'S12',
        int, int,
        int, int, 'S50', int,
        int, 'S7',
        int, 'S7',
        int, 'S9',
        int, 'S8',
        int, 'S1',
        int,
    ],
})

# Open in binary mode and close the handle once the data has been read.
with open("author_1", "rb") as f:
    a = np.fromfile(f, dtype=dt, count=-1, sep="")
And I get this:
array([ (1, 13, b'Scott Shenker', 200, 1, 61, b'Integrated services in the internet architecture: an overview', 3, 8, b'R Braden', 7, b'D Clark', 12, b'S Shenker\xe2\x80\xa6', 1994, 1000, 401, 50, b'[HTML] An architecture for differentiated services', 5, 7, b'D Black', 7, b'S Blake', 9, b'M Carlson', 8, b'E Davies', 1, b'Z', 1998),
(402, 72, b'Resource rese', 1952544370, 544108393, 1953460848, b'ocol (RSVP)--Version 1 functional specification\x05\x00\x00\x00\x08\x00\x00\x00R Brad', 487013, 541851648, b'Zhang\x08', 1109414656, b'erson\x08', 542310400, b'Herzog\x07\x00\x00\x00S ', 1768776010, 511342, 103168, 22016, b'\x00A reliable multicast framework for light-weight s', 1769173861, 544435823, b'and app', 1633905004, b'tion le', 543974774, b'framing\x04', 458752, b'\x00\x00S Floy', 2660, b'', 1632247894),
Any idea how I can read the whole file?

I agree with Ryan: parsing the data is straightforward, but not trivial, and really tedious. Whatever disk space you save by packing the data this way, you pay for dearly when unpacking it.
Anyway, the file is made of variable length records and fields. Each record is made of variable number and length of fields that we can read in chunks of bytes. Each chunk will have different format. You get the idea. Following this logic, I assembled these three functions, that you can finish, modify, test, etc:
from struct import Struct
import struct
def read_chunk(fmt, fileobj):
    """Read one chunk described by the struct format *fmt* from *fileobj*.

    Exactly ``Struct(fmt).size`` bytes are consumed from the current file
    position and unpacked.

    Args:
        fmt: A ``struct`` format string, e.g. ``'ii'`` or ``'=13si'``.
        fileobj: A binary file-like object providing ``read(n)``.

    Returns:
        The tuple of unpacked values (a tuple even for a single field).

    Raises:
        struct.error: If *fmt* is invalid or fewer bytes than required are
            left in *fileobj* (for example at end of file).
    """
    chunk_struct = Struct(fmt)
    chunk = fileobj.read(chunk_struct.size)
    # Report a short read explicitly; callers use struct.error as the
    # end-of-data signal, so keep that exception type.
    if len(chunk) != chunk_struct.size:
        raise struct.error(
            "expected {} bytes for format {!r}, got {}".format(
                chunk_struct.size, fmt, len(chunk)))
    return chunk_struct.unpack(chunk)
def read_record(fileobj):
    """Read one complete author record from *fileobj*.

    Layout consumed, in order: author id, name length, name, number of
    publications; then per publication: id, title length, title, number of
    authors, (name length, name) per author, year, number of citations; then
    per citation: id, title length, title, number of authors,
    (name length, name) per author, year.

    All formats start with ``=`` so that no alignment padding is inserted
    between a string and the integer that follows it.

    Args:
        fileobj: A binary file-like object positioned at the start of a record.

    Returns:
        A dict with keys ``'author_id'``, ``'author_name'`` (bytes) and
        ``'publications'`` (a list of dicts, each holding its authors, year
        and citations).

    Raises:
        struct.error: Propagated from ``read_chunk`` when the data ends
            before the record is complete.
    """
    author_id, len_author_name = read_chunk('=ii', fileobj)
    # 's' yields the whole name as one bytes object; 'c' would yield one
    # value per character and break the two-target unpacking.
    author_name, nu_of_publ = read_chunk(
        '={}si'.format(len_author_name), fileobj)
    record = {'author_id': author_id,
              'author_name': author_name,
              'publications': []}

    for _ in range(nu_of_publ):
        pub_id, len_pub_title = read_chunk('=ii', fileobj)
        pub_title, num_pub_auth = read_chunk(
            '={}si'.format(len_pub_title), fileobj)
        publication = {'publication_id': pub_id,
                       'publication_title': pub_title,
                       'publication_authors': []}

        for _ in range(num_pub_auth):
            # read_chunk always returns a tuple, hence the trailing commas.
            len_pub_auth_name, = read_chunk('=i', fileobj)
            pub_auth_name, = read_chunk(
                '={}s'.format(len_pub_auth_name), fileobj)
            publication['publication_authors'].append({'name': pub_auth_name})

        year_publ, nu_of_cit = read_chunk('=ii', fileobj)
        publication['publication_year'] = year_publ
        publication['citations'] = []

        for _ in range(nu_of_cit):
            cit_id, len_cit_title = read_chunk('=ii', fileobj)
            cit_title, num_cit_auth = read_chunk(
                '={}si'.format(len_cit_title), fileobj)
            citation = {'citation_id': cit_id,
                        'citation_title': cit_title,
                        'citation_authors': []}

            for _ in range(num_cit_auth):
                len_cit_auth_name, = read_chunk('=i', fileobj)
                cit_auth_name, = read_chunk(
                    '={}s'.format(len_cit_auth_name), fileobj)
                citation['citation_authors'].append({'name': cit_auth_name})

            year_cit_publ, = read_chunk('=i', fileobj)
            citation['citation_year'] = year_cit_publ
            publication['citations'].append(citation)

        record['publications'].append(publication)

    return record
def parse_file(filename):
    """Parse every record stored in the binary file *filename*.

    Records are read one after another with ``read_record`` until it raises
    ``struct.error``, which is treated as the end of the data.

    Args:
        filename: Path of the binary file to read.

    Returns:
        The list of records that were read completely, in file order.
    """
    records = []
    with open(filename, 'rb') as f:
        while True:
            try:
                records.append(read_record(f))
            except struct.error:
                # End of file, or a truncated trailing record: stop reading
                # and keep what was parsed so far.
                break
    return records

The data structure stored in this file is hierarchical, rather than "flat": child arrays of different length are stored within each parent element. It is not possible to represent such a data structure using numpy arrays (even recarrays), and therefore it is not possible to read the file with np.fromfile().
What do you mean by "open the whole file"? What sort of python data structure would you like to end up with?
It would be straightforward, but still not trivial, to write a function to parse the file into a list of dictionaries.

Related

Adding more attributes to json

Currently, I store just the device_id of a certain object with an assigned sACN universe. The function looks like this:
for device_index in range(device_count):
device_name = sdk.get_device_info(device_index)
device_type = device_name.type
if device_name.id not in conf:
universe = get_free_universe()
conf.update({device_name.id: universe})
print(f"conf= {conf}")
else:
universe = conf[device_name.id] #main.py line 368
save_config(DEVICE_PATH)
Which produces this JSON:
{
"3192efa109cfb5d86f09a82a7cc00c5d": 4,
"42aa42a0bb5fcee780fb1be13dfcb873": 5,
"4b80e1817076307b36c58c31118f6696": 1,
"62c13e2db726382e9c66d9f69020ab5e": 6,
"a51da6fe155f299a3fc474c22310cde9": 2,
"b5ff59af43d6c3572a41d7693b5bec1c": 3
}
Now I wanna store together with the device_id not only the universe, but its device_name.model attribute, kinda in this format:
{
    "3192efa109cfb5d86f09a82a7cc00c5d": {
        "universe": 4,
        "model": "Vengance RGB PRO"
    },
    "42aa42a0bb5fcee780fb1be13dfcb873": {
        "universe": 5,
        "model": "M65 PRO"
    }
}
I have absolutely no clue how to do it. As far as I know, my conf is a Python dict, where I can't do something like this, so I would need to use Python lists. I define it like this: conf = load_config(DEVICE_PATH). The function load_config() looks like this:
def load_config(config_path):
    """Load a JSON configuration file, creating it when it does not exist.

    Args:
        config_path: Path of the JSON configuration file.

    Returns:
        The parsed JSON content, or ``{}`` when the file does not contain
        valid JSON and *config_path* is not ``MQTT_PATH``.

    Raises:
        SystemExit: When *config_path* is ``MQTT_PATH`` and the file does not
            contain valid JSON. A default MQTT configuration is written
            first so that the user can edit it and restart.
    """
    if not os.path.isfile(config_path):
        # Create the file if not present; the with-block closes the handle
        # immediately instead of leaking it.
        with open(config_path, "w+"):
            pass

    try:
        with open(config_path) as f:  # load the config file
            return json.load(f)
    except json.JSONDecodeError:
        if config_path != MQTT_PATH:
            return {}

    # Only reached for an invalid or empty MQTT config: write the defaults.
    data = {
        'enable_MQTT': True,
        'ip': "",
        'port': 1883,
        'username': "",
        'password': "",
        'base_topic': "",
    }
    with open(config_path, "w", encoding="utf-8") as f:  # Save config
        json.dump(
            data,
            f,
            ensure_ascii=False,
            sort_keys=False,
            indent=4
        )
    print(f"MQTT Config Created, please edit {MQTT_PATH} and restart this program!")
    print("For Home Assistant Auto Discovery, set base_topic to homeassistant!")
    sys.exit()
When I try to convert the dict to a list via conf = list(conf), I just get this runtime error:
File "c:\Users\tenn0\Documents\Projects\iCue2sACN-mqtt\src\main.py", line 368, in <module>
universe = conf[device_name.id]
TypeError: list indices must be integers or slices, not str
I've marked the relevant line in the first snippet.
Edit: As pointed out in the comments, i can simply do a dict inside a dict. conf.update({device_name.id: {"model": device_name.model, "universe": universe}}) did the trick for me.

Read from binary file with Python 3.5

I use this piece of code:
from struct import Struct
import struct
def read_chunk(fmt, fileobj):
chunk_struct = Struct(fmt)
chunk = fileobj.read(chunk_struct.size)
return chunk_struct.unpack(chunk)
def read_record(fileobj):
author_id, len_author_name = read_chunk('ii', f)
author_name, nu_of_publ = read_chunk(str(len_author_name)+'si', f) # 's' or 'c' ?
record = { 'author_id': author_id,
'author_name': author_name,
'publications': [] }
for pub in range(nu_of_publ):
pub_id, len_pub_title = read_chunk('ii', f)
pub_title, num_pub_auth = read_chunk(str(len_pub_title)+'si', f)
record['publications'].append({
'publication_id': pub_id,
'publication_title': pub_title,
'publication_authors': [] })
for auth in range(num_pub_auth):
len_pub_auth_name = read_chunk('i', f)
pub_auth_name = read_chunk(str(len_pub_auth_name)+'s', f)
record['publications']['publication_authors'].append({'name': pub_auth_name})
year_publ, nu_of_cit = read_chunk('ii', f)
# Finish building your record with the remaining fields...
for cit in range(nu_of_cit):
cit_id, len_cit_title = read_chunk('ii', f)
cit_title, num_cit_auth = read_chunk(str(len_cit_title)+'si', f)
for cit_auth in range(num_cit_auth):
len_cit_auth_name = read_chunk('i', f)
cit_auth_name = read_chunk(str(len_cit_auth_name)+'s', f)
year_cit_publ = read_chunk('i', f)
return record
def parse_file(filename):
records = []
with open(filename, 'rb') as f:
while True:
try:
records.append(read_record(f))
except struct.error:
break
to read this file:
https://drive.google.com/open?id=0B3SYAHrxLP69NHlWc25KeXFHNVE
with this format:
Inside the function read_record, the variables author_id, len_author_name and author_name are read correctly, but nu_of_publ and the variables after it are not.
Any idea what's wrong?
When i run this piece of code:
author_id, len_author_name = read_chunk('LL', f)
author_name, nu_of_publ= read_chunk(str(len_author_name)+'sL', f)
#nu_of_publ = read_chunk('I', f)# 's' or 'c' ?
record = { 'author_id': author_id,
'author_name': author_name,
'publications': [] }
print (record, nu_of_publ)
for pub in range(nu_of_publ):
pub_id, len_pub_title = read_chunk('LL', f)
print (pub_id, len_pub_title)
I get this result:
{'author_name': b'Scott Shenker', 'author_id': 1, 'publications': []} 256
15616 1953384704
but it should print 200 instead of 256, 1 instead of 15616, etc.
This format is not correct:
author_name, nu_of_publ = read_chunk(str(len_author_name)+'si', f)
You are defining a structure of N characters and an integer. Those structures are aligned, the same way as they would if you had the structure defined in c:
struct {
char author_name[N];
int nu_of_publ;
};
What alignment does is: it puts beginning of every int to a position which is a multiple of 4. This is done (in C) because CPUs are optimized for accessing such addresses.
So, if author's name's length is 6, the next two bytes will be skipped before reading the next integer.
One solution to separate the structures:
# read_chunk returns a tuple, so both single-field reads need unpacking.
author_name, = read_chunk(str(len_author_name)+'s', f)
nu_of_publ, = read_chunk('i', f)
Note: The comma after nu_of_publ (nu_of_publ,) is to unpack the tuple returned by read_chunk.
Another solution is to start the format string with =, which selects native byte order with standard sizes and no alignment padding (see the byte-order table in the struct module documentation):
author_name, nu_of_publ = read_chunk('={}si'.format(len_author_name), f)

How to open a csv file for reading purpose using mmap in python?

I want to open csv file for reading purpose. But I'm facing some exceptions regarding to that.
I'm using Python 2.7.
main.python-
if __name__ == "__main__":
f = open('input.csv','r+b')
m = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
reader = csv.DictReader(iter(m.readline, ""))
for read in reader:
num = read['time']
print num
output-
Traceback (most recent call last):
File "/home/PycharmProjects/time_gap_Task/main.py", line 22, in <module>
for read in reader:
File "/usr/lib/python3.4/csv.py", line 109, in __next__
self.fieldnames
File "/usr/lib/python3.4/csv.py", line 96, in fieldnames
self._fieldnames = next(self.reader)
_csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
How can I resolve this error, and what is the proper way to read a CSV file using mmap together with the csv module?
I know you asked this a while ago, but I actually created a module for myself that does this, because I do a lot of work with large CSV files, and sometimes I need to convert them into dictionaries, based on a key. Below is the code I've been using. Please feel free to modify as needed.
def MmapCsvFileIntoDict(csvFilePath, skipHeader = True, transform = lambda row: row, keySelector = lambda o: o):
    """
    Memory-map a CSV file and return a dictionary of its rows.

    Each line is decoded as UTF-8, stripped and parsed with csv.reader into a
    list of strings. Blank lines are skipped. The parsed row is passed to
    transform (when it is callable) and the result is stored under the key
    computed by keySelector.

    Args:
        csvFilePath: Path of the CSV file to read.
        skipHeader: When true, the first line of the file is ignored.
        transform: Callable converting the row (a list of strings) into the
            value to store. A non-callable value stores the row unchanged.
        keySelector: Callable computing the key from the transformed value;
            a non-callable value is used as the key itself. NOTE: the default
            returns the value itself, which only works when that value is
            hashable. Untransformed rows are lists, so pass a keySelector
            (or a transform returning a hashable object).

    Returns:
        A dict mapping key -> value. An empty file yields an empty dict, and
        an empty dict is also returned when the file cannot be opened.
    """
    contents = {}
    try:
        # Read-only access is enough, so the file need not be writable.
        with open(csvFilePath, "rb") as f:
            # mmap refuses to map a zero-length file, so handle it up front.
            f.seek(0, 2)
            if f.tell() == 0:
                return contents
            # memory-map the file, size 0 means whole file
            mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            try:
                isFirstLine = True
                for line in iter(mm.readline, b''):
                    if isFirstLine:
                        isFirstLine = False
                        if skipHeader:
                            continue
                    text = line.decode('utf-8').strip()
                    row = next(csv.reader([text]), None)
                    # A blank line parses to an empty row; nothing to store.
                    if not row:
                        continue
                    value = transform(row) if callable(transform) else row
                    key = keySelector(value) if callable(keySelector) else keySelector
                    contents[key] = value
            finally:
                # Release the mapping even when transform/keySelector raises.
                mm.close()
    except IOError as ie:
        PrintWithTs('Error decomposing the companies: {0}'.format(ie))
        return {}
    return contents
When you call this method, you have some options.
Assume you have a file that looks like:
Id, Name, PhoneNumber
1, Joe, 7175551212
2, Mary, 4125551212
3, Vince, 2155551212
4, Jane, 8145551212
The easiest way to call it is like this:
dict = MmapCsvFileIntoDict('/path/to/file.csv', keySelector = lambda row: row[0])
What you get back is a dict looking like this:
{ '1' : ['1', 'Joe', '7175551212'], '2' : ['2', 'Mary', '4125551212'] ...
One thing I like to do is create a class or a namedtuple to represent my data:
class CsvData:
    """Typed view of one CSV row of the form [Id, Name, PhoneNumber]."""

    def __init__(self, row):
        """Build the object from *row*, a sequence of three strings.

        Raises:
            ValueError: If the id or phone field is not an integer.
            IndexError: If *row* has fewer than three fields.
        """
        self.Id = int(row[0])
        # Names are stored upper-cased.
        self.Name = row[1].upper()
        self.Phone = int(row[2])
And then when I call the method, I pass in a second lambda to transform each row in the file to an object I can work with:
dict = MmapCsvFileIntoDict('/path/to/file.csv', transform = lambda row: CsvData(row), keySelector = lambda o: o.Id)
What I get back that time looks like:
{ 1 : <object instance>, 2 : <object instance>...
I hope this helps! Best of luck
When you open a file with the b flag like this:
f = open('input.csv','r+b')
You read the file as bytes and not as string.
So, try to change the flags to r:
f = open('input.csv','r')
if you just want to read data with specific columnes from csv file, just try:
import csv

# Text mode gives the csv module str lines (the traceback above comes from
# passing bytes); newline='' lets csv handle line endings inside fields.
with open('input.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # print() is a function on the Python 3 interpreter shown above.
        print(row['time'])

Append JSON to file

I am trying to append values to a JSON file. How can I append the data? I have tried many ways, but none of them work.
Code:
def all(title,author,body,type):
title = "hello"
author = "njas"
body = "vgbhn"
data = {
"id" : id,
"author": author,
"body" : body,
"title" : title,
"type" : type
}
data_json = json.dumps(data)
#data = ast.literal_eval(data)
#print data_json
if(os.path.isfile("offline_post.json")):
with open('offline_post.json','a') as f:
new = json.loads(f)
new.update(a_dict)
json.dump(new,f)
else:
open('offline_post.json', 'a')
with open('offline_post.json','a') as f:
new = json.loads(f)
new.update(a_dict)
json.dump(new,f)
How can I append data to json file when this function is called?
I suspect you left out that you're getting a TypeError in the blocks where you're trying to write the file. Here's where you're trying to write:
with open('offline_post.json','a') as f:
new = json.loads(f)
new.update(a_dict)
json.dump(new,f)
There's a couple of problems here. First, you're passing a file object to the json.loads command, which expects a string. You probably meant to use json.load.
Second, you're opening the file in append mode, which places the pointer at the end of the file. When you run the json.load, you're not going to get anything because it's reading at the end of the file. You would need to seek to 0 before loading (edit: this would fail anyway, as append mode is not readable).
Third, when you json.dump the new data to the file, it's going to append it to the file in addition to the old data. From the structure, it appears you want to replace the contents of the file (as the new data contains the old data already).
You probably want to use r+ mode, seeking back to the start of the file between the read and write, and truncating at the end just in case the size of the data structure ever shrinks.
with open('offline_post.json', 'r+') as f:
new = json.load(f)
new.update(a_dict)
f.seek(0)
json.dump(new, f)
f.truncate()
Alternatively, you can open the file twice:
with open('offline_post.json', 'r') as f:
new = json.load(f)
new.update(a_dict)
with open('offline_post.json', 'w') as f:
json.dump(new, f)
This is a different approach: I just wanted to append without reloading all the data. I'm running on a Raspberry Pi, so I want to keep memory usage low. The test code:
import os

filename = "/home/pi/scratch_pad/test.json"

# remove the last run json data
try:
    os.remove(filename)
except OSError:
    pass

json_file_exists = 0
count = 0
boiler = 90
tower = 78
while count < 10:
    if json_file_exists == 0:
        # create the json file as a one-element array; JSON requires
        # double-quoted keys
        with open(filename, mode='w') as fw:
            json_string = '[\n\t{"boiler":' + str(boiler) + ',"tower":' + str(tower) + '}\n]'
            fw.write(json_string)
        json_file_exists = 1
    else:
        # append to the json file without loading its content
        boiler = boiler + .01
        tower = tower + .02
        json_string = '\n\t,{"boiler":' + str(boiler) + ',"tower":' + str(tower) + '}\n]'
        with open(filename, mode='rb+') as f:
            # Walk back from the end, one byte at a time, to just after the
            # last "}" so that only the closing "\n]" is dropped.
            f.seek(0, os.SEEK_END)
            end = f.tell()
            while end > 0:
                f.seek(end - 1)
                if f.read(1) == b"}":
                    break
                end -= 1
            f.truncate(end)
            # Write the new element and the closing bracket in place.
            f.seek(end)
            f.write(json_string.encode('ascii'))
    count = count + 1

Create CSV file from read Data from def main()

I would like to know whether it is possible to create a CSV file from input that does not come from another CSV file.
I have a Python script that reads data memory from a PLC and prints the result.
Is it possible to create a CSV file from input like this?
def main( ):
plc = OmronPLC( )
print plc.openFins('ip_address', port)
print plc.readMemC('D2000', 2)
print plc.readMemC('D2005', 5)
plc.close ( )
if __name__ == "__main__":
main()
import csv
with open ('test.csv', 'w') as fp:
a = csv.writer(fp, delimiter=',')
data = [ main( ) ]
a.writerows(data)
That's a piece of script that read Data Memory from a plc.
When I launch this script, it prints the data to the screen, but when I try to create a CSV file, the file only contains "".
The values read from the PLC data memory are numbers, and I am trying to create a CSV like this:
12, 3
25, 54
44, 555
The output when script print to monitor value is like this:
[12, 3,]
[12, 4, 54, 44, 555]
How can I have input data from that definition?
The main() function as you have written it should return the list of values to be written to the CSV file. For example:
def main( ):
    """Read two memory areas from the PLC and return the values read.

    Returns:
        A list ``[ip, d2000, d2005]`` holding the result of ``openFins`` and
        of the two ``readMemC`` calls, so the caller can write it to a CSV.
    """
    plc = OmronPLC( )
    try:
        ip = plc.openFins('ip_address', port)
        d2000 = plc.readMemC('D2000', 2)
        d2005 = plc.readMemC('D2005', 5)
    finally:
        # Always release the connection, even when a read fails.
        plc.close ( )
    # A single formatted string prints identically on Python 2 and 3.
    print("%s %s %s" % (ip, d2000, d2005))
    return [ip, d2000, d2005]

Categories

Resources