Read from binary file with Python 3.5 - python

I use this piece of code:
from struct import Struct
import struct

def read_chunk(fmt, fileobj):
    chunk_struct = Struct(fmt)
    chunk = fileobj.read(chunk_struct.size)
    return chunk_struct.unpack(chunk)

def read_record(fileobj):
    author_id, len_author_name = read_chunk('ii', fileobj)
    author_name, nu_of_publ = read_chunk(str(len_author_name)+'si', fileobj) # 's' or 'c' ?
    record = { 'author_id': author_id,
               'author_name': author_name,
               'publications': [] }
    for pub in range(nu_of_publ):
        pub_id, len_pub_title = read_chunk('ii', fileobj)
        pub_title, num_pub_auth = read_chunk(str(len_pub_title)+'si', fileobj)
        record['publications'].append({
            'publication_id': pub_id,
            'publication_title': pub_title,
            'publication_authors': [] })
        for auth in range(num_pub_auth):
            len_pub_auth_name, = read_chunk('i', fileobj)
            pub_auth_name, = read_chunk(str(len_pub_auth_name)+'s', fileobj)
            record['publications'][-1]['publication_authors'].append({'name': pub_auth_name})
        year_publ, nu_of_cit = read_chunk('ii', fileobj)
        # Finish building your record with the remaining fields...
        for cit in range(nu_of_cit):
            cit_id, len_cit_title = read_chunk('ii', fileobj)
            cit_title, num_cit_auth = read_chunk(str(len_cit_title)+'si', fileobj)
            for cit_auth in range(num_cit_auth):
                len_cit_auth_name, = read_chunk('i', fileobj)
                cit_auth_name, = read_chunk(str(len_cit_auth_name)+'s', fileobj)
            year_cit_publ, = read_chunk('i', fileobj)
    return record

def parse_file(filename):
    records = []
    with open(filename, 'rb') as f:
        while True:
            try:
                records.append(read_record(f))
            except struct.error:
                break
    return records
to read this file:
https://drive.google.com/open?id=0B3SYAHrxLP69NHlWc25KeXFHNVE
with this format (the record layout is given as an image in the original post).
Inside the function read_record, the variables author_id, len_author_name and author_name are read correctly, but nu_of_publ and the variables that follow are not.
Any idea what's wrong?
When I run this piece of code:
author_id, len_author_name = read_chunk('LL', f)
author_name, nu_of_publ = read_chunk(str(len_author_name)+'sL', f)
#nu_of_publ = read_chunk('I', f)  # 's' or 'c' ?
record = { 'author_id': author_id,
           'author_name': author_name,
           'publications': [] }
print (record, nu_of_publ)
for pub in range(nu_of_publ):
    pub_id, len_pub_title = read_chunk('LL', f)
    print (pub_id, len_pub_title)
I get this result:
{'author_name': b'Scott Shenker', 'author_id': 1, 'publications': []} 256
15616 1953384704
but it should print 200 instead of 256, 1 instead of 15616, and so on.

This format is not correct:
author_name, nu_of_publ = read_chunk(str(len_author_name)+'si', f)
You are defining a structure of N characters and an integer. Those structures are aligned the same way as they would be if you had defined the structure in C:
struct {
    char author_name[N];
    int nu_of_publ;
};
What alignment does is put the beginning of every int at an offset that is a multiple of 4. This is done (in C) because CPUs are optimized for accessing such addresses.
So, if the author's name is 6 characters long, the next two bytes will be skipped before the next integer is read.
One solution is to separate the structures:
author_name, = read_chunk(str(len_author_name)+'s', f)
nu_of_publ, = read_chunk('i', f)
Note: The trailing comma (as in nu_of_publ,) is there to unpack the single-element tuple returned by read_chunk.
Another solution is to specify the structure with = at the beginning (standard sizes, no alignment), based on the format table in the struct documentation:
author_name, nu_of_publ = read_chunk('={}si'.format(len_author_name), f)
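To see the padding concretely, struct.calcsize reports how many bytes a format string consumes. The numbers below assume a typical 64-bit CPython build where a native int is 4-byte aligned; exact native sizes can vary by platform.
import struct

# Native alignment (the default): the int is aligned to a 4-byte boundary,
# so two padding bytes follow the 6-byte string.
print(struct.calcsize('6si'))    # typically 12

# Standard sizes, no alignment: exactly 6 + 4 bytes.
print(struct.calcsize('=6si'))   # 10

# '<' (little-endian) also disables alignment and pins the byte order,
# which is usually what you want when parsing a file format.
print(struct.calcsize('<6si'))   # 10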

Related

Python lambda expression - string indices must be integers

I am trying to use data from one JSON file to update another. To keep the search efficient, I'm attempting to use a lambda expression to locate the correct record to update.
The goal is to update the "PreviousMappings" key as efficiently as possible.
Code:
for p in Path('jobs/data').glob('*.json'):
    with open(p, 'r') as f:
        print('Loaded - ', f.name)
        jdata = json.load(f)
        for j in jdata['Mappings']:
            jc = j['Job Code']
            with open('newdata/jobs.json', 'r+') as s:
                print('Loaded - ', s.name)
                data = json.load(s)
                found = list(filter(lambda x: x['Jobcode'] == jc, data))  # throws exception
JSON:
{
    "JobCodes": [
        {
            "Bid Code": "123-456",
            "Description": "JOB DESCRIPTION",
            "Jobcode": "ABC123",
            "PreviousMappings": ""
        }
    ]
}
The exception happens because data is a dict, so filter iterates over its string keys rather than over the records; you need to iterate over data['JobCodes']. The following does what you're asking, but you might consider a different approach.
data = json.load(open('newdata/jobs.json', 'r'))

for p in Path('jobs/data').glob('*.json'):
    with open(p, 'r') as f:
        print('Loaded - ', f.name)
        jdata = json.load(f)
        for j in jdata['Mappings']:
            jc = j['Job Code']
            for k in data['JobCodes']:
                if k['Jobcode'] == jc:
                    k['PreviousMappings'] = "something"
                    break

json.dump(data, open('newdata/jobs.json', 'w'))
If you have a LOT of files, you might consider building an index so you can do a direct lookup. For example (untested):
data = json.load(open('newdata/jobs.json', 'r'))

dindex = {}
for k in data['JobCodes']:
    dindex[k['Jobcode']] = k
Now you don't have to search -- Python will do it:
for j in jdata['Mappings']:
    jc = j['Job Code']
    if jc in dindex:
        dindex[jc]['PreviousMappings'] = "something"
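Putting the pieces together, here is a minimal end-to-end sketch of the index approach. It assumes the layout from the question (a top-level "JobCodes" list in newdata/jobs.json and a "Mappings" list in each source file); the value written to "PreviousMappings" is just a placeholder.
import json
from pathlib import Path

with open('newdata/jobs.json', 'r') as s:
    data = json.load(s)

# Build the index once: Jobcode -> record
dindex = {k['Jobcode']: k for k in data['JobCodes']}

for p in Path('jobs/data').glob('*.json'):
    with open(p, 'r') as f:
        jdata = json.load(f)
    for j in jdata['Mappings']:
        jc = j['Job Code']
        if jc in dindex:
            dindex[jc]['PreviousMappings'] = "something"  # placeholder value

# Write everything back once, at the end
with open('newdata/jobs.json', 'w') as s:
    json.dump(data, s, indent=4)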

How to Change dictionary values in python file from another file

I would like to change values in a dict that lives in another file. File1.py contains the code to edit the dict; File2.py contains the dict itself.
File1.py generates a value that should replace the BTok entry only.
File1.py:
with open('file2.py', 'r') as file:
    filedata = file.read()
print(filedata.str(BTK['btk1']))
for line in filedata:
    line['btk1'] = BTok
    with open('file2.py', 'w') as file:
        file.write(line)
File2.py:
c = {
    'id' : 'C80e3ce43c3ea3e8d1511ec',
    'secret' : 'c10c371b4641010a750073925b0857'
}
rk = {
    't1' : 'ZTkwMGE1MGEt',
}
BTK = {
    'BTok' : '11eyJhbGc'
}
If you want to do this reliably, that is, so it works whether your strings are quoted with ', " or """, for whatever values they have and whatever newlines you put around them, then you may want to use ast to parse the source code and modify it. The only inconvenience is that the ast module cannot, by itself, generate source code, so you would need to install an additional dependency such as astor for what is essentially a rather menial task. In any case, here is how you could do it that way:
import ast
import astor

# To read from file:
# with open('file2.py', 'r') as f: code = f.read()
code = """
c = {
    'id' : 'C80e3ce43c3ea3e8d1511ec',
    'secret' : 'c10c371b4641010a750073925b0857'
}
rk = {
    't1' : 'ZTkwMGE1MGEt',
}
BTK = {
    'BTok' : '11eyJhbGc'
}
"""

# Value to replace
KEY = 'BTok'
NEW_VALUE = 'new_btok'

# Parse code
m = ast.parse(code)
# Go through module statements
for stmt in m.body:
    # Only look at assignments
    if not isinstance(stmt, ast.Assign): continue
    # Take right-hand side of the assignment
    value = stmt.value
    # Only look at dict values
    if not isinstance(value, ast.Dict): continue
    # Look for keys that match what we are looking for
    replace_idx = [i for i, k in enumerate(value.keys)
                   if isinstance(k, ast.Str) and k.s == KEY]
    # Replace corresponding values
    for i in replace_idx:
        value.values[i] = ast.Str(NEW_VALUE)

new_code = astor.to_source(m)
# To write to file:
# with open('file2.py', 'w') as f: f.write(new_code)
print(new_code)
# c = {'id': 'C80e3ce43c3ea3e8d1511ec', 'secret':
#     'c10c371b4641010a750073925b0857'}
# rk = {'t1': 'ZTkwMGE1MGEt'}
# BTK = {'BTok': 'new_btok'}
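A small aside, stated as an assumption about your environment: on Python 3.9 or newer the standard library can regenerate the source itself via ast.unparse (and ast.Constant supersedes the deprecated ast.Str), so the astor dependency is not needed. A sketch reusing code, KEY and NEW_VALUE from above:
import ast

m = ast.parse(code)
for stmt in m.body:
    if not isinstance(stmt, ast.Assign) or not isinstance(stmt.value, ast.Dict):
        continue
    for i, k in enumerate(stmt.value.keys):
        if isinstance(k, ast.Constant) and k.value == KEY:
            # Swap in the new string literal
            stmt.value.values[i] = ast.Constant(NEW_VALUE)

new_code = ast.unparse(m)  # no third-party package required
print(new_code)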

Converting data to JSON in same format as CSV

I have the following code, which prints the data as CSV:
title = ['Username', 'Name', 'Job']
for x in title:
    print(x, end=",")
for d in data:
    line = d.get_username() + "," + d.get_name() + "," + d.get_role()
    print(line)
I get:
Username,Name,Job
rob,robert,developer
danny21,danny,developer
I want to print the same data as JSON in order to get:
[
    {
        "Username": "rob",
        "Name": "robert",
        "Job": "developer"
    },
    {
        "Username": "danny21",
        "Name": "danny",
        "Job": "developer"
    }
]
From previous topics I learned that we can use json.dumps, but I'm not sure whether it helps in this case.
What is the proper way to achieve it?
You could simply do:
l = []
for d in data:
    user_dictionary = {}
    user_dictionary[title[0]] = d.get_username()
    user_dictionary[title[1]] = d.get_name()
    user_dictionary[title[2]] = d.get_role()
    l.append(user_dictionary)
to get a JSON-like list of dictionaries.
You can also avoid appending and do:
def get_user_data(user):
    user_dictionary = {}
    user_dictionary[title[0]] = user.get_username()
    user_dictionary[title[1]] = user.get_name()
    user_dictionary[title[2]] = user.get_role()
    return user_dictionary

l = list(map(get_user_data, data))
You can use json.dump to write l to a file:
import json

with open('data.json', 'w') as outfile:
    json.dump(l, outfile)
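Since the question asks for printing the JSON rather than only writing a file, json.dumps with an indent produces exactly the shape shown above (a sketch, assuming l was built as in the snippets here):
import json

print(json.dumps(l, indent=4))
# [
#     {
#         "Username": "rob",
#         "Name": "robert",
#         "Job": "developer"
#     },
#     ...
# ]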

Lambda fn to get s3 file , use it to change another s3 file then rewrite it to s3

This is Python code that I use to manipulate a file, table1, using a reference file, pds_ref.
So pds_ref looks like this :
|THE_TABLE|THE_KEY
|table1|3
|table1|1
table1 looks like this
|ID|NAME
|1|Imran
|2|Peter
|3|Pedro
|4|Carlos
The idea is to use the references in pds_ref to remove, from whatever table is listed, the records with the corresponding keys... in this case the records with keys 1 and 3 are to be deleted.
This Python code works when run as plain Python:
import csv

with open("pds_ref", "rb") as ref_file:
    refreader = csv.DictReader(ref_file, delimiter='|')
    reftable = [row for row in refreader]
    refheader = refreader.fieldnames

for refrow in reftable:
    print refrow['THE_TABLE']
    print refrow['THE_KEY']
    with open(refrow['THE_TABLE'], "rb") as infile:
        reader = csv.DictReader(infile, delimiter='|')
        table = [row for row in reader]
        header = reader.fieldnames
    with open(refrow['THE_TABLE'], "wb") as outfile:
        writer = csv.DictWriter(outfile, header, delimiter='|')
        writer.writeheader()
        for row in table:
            if row['ID'] != refrow['THE_KEY']:
                writer.writerow(row)
Now I want to do this using AWS Lambda, such that the function is triggered every time someone uploads the pds_ref file.
I got as far as being able to get the pds_ref file and read each line, but I'm having trouble doing the equivalent of opening and writing back the amended table1 file. Any help appreciated.
import boto3
import csv
import io

def lambda_handler(event, context):
    s3 = boto3.client("s3")
    if event:
        print("Event : ", event)
        file_obj = event["Records"][0]
        filename = str(file_obj['s3']['object']['key'])
        bucketname = str(file_obj['s3']['bucket']['name'])
        print("Filename: ", filename)
        print("Bucket: ", bucketname)
        fileObj = s3.get_object(Bucket="lambda-trig1", Key=filename)
        print("fileObj: ", fileObj)
        file_content = fileObj["Body"].read().decode('utf-8')
        print(file_content)
        f_pds_ref = s3.get_object(Bucket="lambda-trig1", Key='pds_ref')
        fc_pds_ref = f_pds_ref['Body'].read().decode('utf-8').splitlines(True)
        for refrow in csv.DictReader(fc_pds_ref, delimiter='|'):
            print(refrow['THE_TABLE'])
            print(refrow['THE_KEY'])
            current_table = refrow['THE_TABLE']
            current_key = refrow['THE_KEY']
            f_the_next_table = s3.get_object(Bucket="lambda-trig1", Key=current_table)
            fc_the_next_table = f_the_next_table['Body'].read().decode('utf-8').splitlines(True)
            with open(refrow[f_the_next_table], "rbw") as infile:
                reader = csv.DictReader(infile, delimiter='|')
                # table = [row for row in reader]
                # header = reader.fieldnames
                # print(header)
Before running the process that updates the other tables, you want to ensure that it runs only for Put events.
Here are a few additions to your current steps after reading pds_ref:
Group all THE_KEY values by THE_TABLE. This lets you update each table object once, instead of doing multiple passes for keys that belong to the same table object.
For each THE_TABLE group, read the table object, filter away the lines whose ID is in that group's THE_KEY list, and write the filtered contents back to the table object.
This can be implemented in the following manner:
from contextlib import contextmanager
from csv import DictReader, DictWriter
from collections import defaultdict
import io
import boto3

s3 = boto3.client("s3")
BUCKET = "creeper-bank"
DELIMITER = "|"
TABLE_OBJECT_COLUMNS = ['', 'ID', 'NAME']
WATCH_KEY = "pds_ref"

def content_as_dict_reader(content):
    yield DictReader(
        content.splitlines(),
        delimiter=DELIMITER)

@contextmanager
def tables_and_lines_for_deletion():
    object_ = s3.get_object(
        Bucket=BUCKET, Key=WATCH_KEY
    )
    content = object_["Body"].read().decode('utf-8')
    return content_as_dict_reader(content)

@contextmanager
def table_record(table):
    object_ = s3.get_object(
        Bucket=BUCKET, Key=table
    )
    content = object_["Body"].read().decode('utf-8')
    return content_as_dict_reader(content)

def object_table(table, record):
    with io.StringIO() as file_:
        writer = DictWriter(
            file_,
            fieldnames=TABLE_OBJECT_COLUMNS,
            delimiter=DELIMITER
        )
        writer.writeheader()
        writer.writerows(list(record))
        s3.put_object(
            Bucket=BUCKET,
            Key=table,
            Body=file_.getvalue()
        )

def lambda_handler(event, context):
    if not event:
        print("Function must be triggered via a published event")
        return
    event_record, *_ = event["Records"]
    match_watchkey = True
    try:
        event_name = str(event_record['eventName'])
        if "Put" not in event_name:
            match_watchkey = False
        s3_event = event_record['s3']
        print("checking if S3 event is a put one for :WATCH_KEY")
        key = s3_event['object']['key']
        bucket = s3_event['bucket']['name']
        if key != WATCH_KEY:
            match_watchkey = False
        if bucket != BUCKET:
            match_watchkey = False
    except KeyError:
        # Handle when event_record isn't an S3 one.
        match_watchkey = False
    if not match_watchkey:
        print("Published event did not match :WATCH_KEY.")
        return
    print("S3 event is a put one for :WATCH_KEY!")
    table_group = defaultdict(list)
    print("Reading :WATCH_KEY content")
    with tables_and_lines_for_deletion() as tables:
        for dct in tables:
            table_k = dct['THE_TABLE']
            table_v = dct['THE_KEY']
            table_group[table_k].append(table_v)
    print("Updating objects found in :WATCH_KEY content")
    for t, ids in table_group.items():
        record_update = None
        with table_record(t) as record:
            record_update = (
                dct
                for dct in record
                if dct["ID"] not in ids
            )
        object_table(t, record_update)
    print("Update completed!")
    return
Testing with a sample event:
sample_event = {
    'Records': [
        {
            'eventName': 'ObjectCreated:Put',
            's3': {
                'bucket': {
                    'name': 'creeper-bank',
                },
                'object': {
                    'key': 'pds_ref',
                }
            },
        }
    ]
}
lambda_handler(sample_event, {})

Read data from binary file python

I have a binary file with this format (the record layout is given as an image in the original post) and I use this code to open it:
import numpy as np

f = open("author_1", "r")
dt = np.dtype({'names': ['au_id', 'len_au_name', 'au_name', 'nu_of_publ', 'pub_id', 'len_of_pub_id', 'pub_title', 'num_auth',
                         'len_au_name_1', 'au_name1', 'len_au_name_2', 'au_name2', 'len_au_name_3', 'au_name3',
                         'year_publ', 'num_of_cit', 'citid', 'len_cit_tit', 'cit_tit', 'num_of_au_cit',
                         'len_cit_au_name_1', 'au_cit_name_1', 'len_cit_au_name_2', 'au_cit_name_2',
                         'len_cit_au_name_3', 'au_cit_name_3', 'len_cit_au_name_4', 'au_cit_name_4',
                         'len_cit_au_name_5', 'au_cit_name_5', 'year_cit'],
               'formats': [int, int, 'S13', int, int, int, 'S61', int, int, 'S8', int, 'S7', int, 'S12', int, int, int, int, 'S50',
                           int, int, 'S7', int, 'S7', int, 'S9', int, 'S8', int, 'S1', int]})
a = np.fromfile(f, dtype=dt, count=-1, sep="")
And I get this:
array([ (1, 13, b'Scott Shenker', 200, 1, 61, b'Integrated services in the internet architecture: an overview', 3, 8, b'R Braden', 7, b'D Clark', 12, b'S Shenker\xe2\x80\xa6', 1994, 1000, 401, 50, b'[HTML] An architecture for differentiated services', 5, 7, b'D Black', 7, b'S Blake', 9, b'M Carlson', 8, b'E Davies', 1, b'Z', 1998),
(402, 72, b'Resource rese', 1952544370, 544108393, 1953460848, b'ocol (RSVP)--Version 1 functional specification\x05\x00\x00\x00\x08\x00\x00\x00R Brad', 487013, 541851648, b'Zhang\x08', 1109414656, b'erson\x08', 542310400, b'Herzog\x07\x00\x00\x00S ', 1768776010, 511342, 103168, 22016, b'\x00A reliable multicast framework for light-weight s', 1769173861, 544435823, b'and app', 1633905004, b'tion le', 543974774, b'framing\x04', 458752, b'\x00\x00S Floy', 2660, b'', 1632247894),
Any idea how I can open the whole file?
I agree with Ryan: parsing the data is straightforward, but not trivial, and really tedious. Whatever disk space you save by packing the data in this way, you pay for dearly at unpacking time.
Anyway, the file is made of variable-length records and fields. Each record is made of a variable number of fields of varying length, which we can read in chunks of bytes, and each chunk has a different format. You get the idea. Following this logic, I assembled these three functions, which you can finish, modify, test, etc.:
from struct import Struct
import struct

def read_chunk(fmt, fileobj):
    chunk_struct = Struct(fmt)
    chunk = fileobj.read(chunk_struct.size)
    return chunk_struct.unpack(chunk)

def read_record(fileobj):
    author_id, len_author_name = read_chunk('ii', fileobj)
    author_name, nu_of_publ = read_chunk(str(len_author_name)+'ci', fileobj) # 's' or 'c' ?
    record = { 'author_id': author_id,
               'author_name': author_name,
               'publications': [] }
    for pub in range(nu_of_publ):
        pub_id, len_pub_title = read_chunk('ii', fileobj)
        pub_title, num_pub_auth = read_chunk(str(len_pub_title)+'ci', fileobj)
        record['publications'].append({
            'publication_id': pub_id,
            'publication_title': pub_title,
            'publication_authors': [] })
        for auth in range(num_pub_auth):
            len_pub_auth_name, = read_chunk('i', fileobj)
            pub_auth_name = read_chunk(str(len_pub_auth_name)+'c', fileobj)
            record['publications'][-1]['publication_authors'].append({'name': pub_auth_name})
        year_publ, nu_of_cit = read_chunk('ii', fileobj)
        # Finish building your record with the remaining fields...
        for cit in range(nu_of_cit):
            cit_id, len_cit_title = read_chunk('ii', fileobj)
            cit_title, num_cit_auth = read_chunk(str(len_cit_title)+'ci', fileobj)
            for cit_auth in range(num_cit_auth):
                len_cit_auth_name, = read_chunk('i', fileobj)
                cit_auth_name = read_chunk(str(len_cit_auth_name)+'c', fileobj)
            year_cit_publ, = read_chunk('i', fileobj)
    return record

def parse_file(filename):
    records = []
    with open(filename, 'rb') as f:
        while True:
            try:
                records.append(read_record(f))
            except struct.error:
                break
    # do something useful with the records...
    return records
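A minimal usage sketch, assuming parse_file is completed as above so that it returns the list it builds, and that 'author_1' is the data file from the question:
records = parse_file('author_1')
print(len(records))                    # number of author records parsed
print(records[0]['author_name'])       # e.g. b'Scott Shenker'
for pub in records[0]['publications']:
    print(pub['publication_id'], pub['publication_title'])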
The data structure stored in this file is hierarchical, rather than "flat": child arrays of different lengths are stored within each parent element. It is not possible to represent such a data structure using numpy arrays (even recarrays), and therefore it is not possible to read the file with np.fromfile().
What do you mean by "open the whole file"? What sort of Python data structure would you like to end up with?
It would be straightforward, but still not trivial, to write a function that parses the file into a list of dictionaries.
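To make that limitation concrete: a NumPy structured dtype has a single fixed itemsize, so np.fromfile can only step through the file in equal-sized strides. A small sketch, using a simplified two-field record as an assumption:
import numpy as np

# A structured dtype has one fixed itemsize...
dt = np.dtype([('au_id', np.int32), ('au_name', 'S13')])
print(dt.itemsize)   # 17 bytes, for every record

# ...so np.fromfile can only parse files whose records are all exactly that
# long. A file where name/title lengths vary from record to record cannot be
# described by any single dtype, which is why the second record in the output
# above came out as garbage.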
