I am trying to use data from one JSON file to update another. To make the search efficient, I'm attempting to use a lambda expression to locate the correct record to update.
The goal is to update the "PreviousMappings" key as efficiently as possible.
Code:
for p in Path('jobs/data').glob('*.json'):
    with open(p, 'r') as f:
        print('Loaded - ', f.name)
        jdata = json.load(f)
    for j in jdata['Mappings']:
        jc = j['Job Code']
        with open('newdata/jobs.json', 'r+') as s:
            print('Loaded - ', s.name)
            data = json.load(s)
            found = list(filter(lambda x: x['Jobcode'] == jc, data))  # throws exception
JSON:
{
    "JobCodes": [
        {
            "Bid Code": "123-456",
            "Description": "JOB DESCRIPTION",
            "Jobcode": "ABC123",
            "PreviousMappings": ""
        }
    ]
}
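(Side note on the exception: data here is the top-level dict, so filter iterates over its keys rather than the records. A minimal sketch of the corrected lookup, assuming the "JobCodes" structure shown above:)
found = list(filter(lambda x: x['Jobcode'] == jc, data['JobCodes']))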
This does what you're asking, but you might consider a different approach.
data = json.load(open('newdata/jobs.json', 'r'))
for p in Path('jobs/data').glob('*.json'):
    with open(p, 'r') as f:
        print('Loaded - ', f.name)
        jdata = json.load(f)
    for j in jdata['Mappings']:
        jc = j['Job Code']
        for k in data['JobCodes']:  # search the list of records, not the top-level dict
            if k['Jobcode'] == jc:
                k['PreviousMappings'] = "something"
                break
json.dump(data, open('newdata/jobs.json', 'w'))  # json.dump takes the object first, then the file
If you have a LOT of files, you might consider building an index, so you can do a direct lookup. For example (untested):
data = json.load(open('newdata/jobs.json', 'r'))
dindex = {}
for k in data['JobCodes']:
    dindex[k['Jobcode']] = k
Now you don't have to search -- Python will do it:
for j in jdata['Mappings']:
    jc = j['Job Code']
    if jc in dindex:
        dindex[jc]['PreviousMappings'] = "something"
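For completeness, here is an untested sketch that ties the index approach together and writes the result back out, assuming the same 'jobs/data' layout and "JobCodes" structure as above:
import json
from pathlib import Path

with open('newdata/jobs.json', 'r') as s:
    data = json.load(s)

# Build the index once: job code -> record
dindex = {k['Jobcode']: k for k in data['JobCodes']}

for p in Path('jobs/data').glob('*.json'):
    with open(p, 'r') as f:
        jdata = json.load(f)
    for j in jdata['Mappings']:
        jc = j['Job Code']
        if jc in dindex:
            dindex[jc]['PreviousMappings'] = "something"

# Write the updated records back once, after all source files are processed.
with open('newdata/jobs.json', 'w') as s:
    json.dump(data, s, indent=4)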
I have a file (my_file.json) that has the contents below:
[
    {
        "use": "abcd",
        "contact": "xyz",
        "name": "my_script.py",
        "time": "11:22:33"
    },
    {
        "use": "abcd",
        "contact": "xyz",
        "name": "some_other_script.py",
        "time": "11:22:33"
    },
    {
        "use": "apqwkndf",
        "contact": "xyz",
        "name": "my_script.py",
        "time": "11:22:33"
    },
    {
        "use": "kjdshfjkasd",
        "contact": "xyz",
        "name": "my_script.py",
        "time": "11:22:33"
    }
]
I used the following Python code to delete the objects that have "name": "my_script.py":
#!/usr/bin/python
import json

obj = json.load(open("my_file.json"))
index_list = []
for i in xrange(len(obj)):
    if obj[i]["name"] == "my_script.py":
        index_list.append(i)
for x in range(len(index_list)):
    obj.pop(index_list[x])
open("output_my_file.json", "w").write(json.dumps(obj, indent=4, separators=(',', ': ')))
but it seems I am stuck, because after popping an index the positions in the actual obj shift, which leads to deleting the wrong index, or sometimes the pop index goes out of range. Any other solution?
Try popping in reverse order:
for x in reversed(range(len(index_list))):
    obj.pop(index_list[x])
This will create a new list and assign only those without "name": "my_script.py" to the new list.
obj = [i for i in obj if i["name"] != "my_script.py"]
import json
with open('my_file.json') as f:
    data = json.load(f)
data = [item for item in data if item.get('name') != 'my_script.py']
with open('output_my_file.json', 'w') as f:
    json.dump(data, f, indent=4)
Try:
import json
json_file = json.load(open("file.json"))
for json_dict in json_file:
    json_dict.pop("name", None)
print(json.dumps(json_file, indent=4))
You don't need the last line with json.dumps; I just have it there so the output looks more readable when printed.
As a general rule of thumb, you usually don't want to modify an iterable while iterating over it.
I suggest you save the elements you do want in the first loop:
import json
with open('path/to/file', 'r') as f:
    data = json.load(f)
items_to_keep = []
for item in data:
    if item['name'] != 'my_script.py':
        items_to_keep.append(item)
with open('path/to/file', 'w') as f:
    json.dump(items_to_keep, f, ...)
The filtering can be reduced to a single line (a list comprehension):
import json
with open('path/to/file', 'r') as f:
    data = json.load(f)
items_to_keep = [item for item in data if item['name'] != 'my_script.py']
with open('path/to/file', 'w') as f:
    json.dump(items_to_keep, f, ...)
I have the following code which prints the object as CSV:
title = ['Username', 'Name', 'Job']
for x in title:
    print(x, end=",")
for d in data:
    line = d.get_username() + "," + d.get_name() + "," + d.get_role()
    print(line)
I get:
Username,Name,Job
rob,robert,developer
danny21,danny,developer
I want to print the same data as JSON in order to get:
[
    {
        "Username":"rob",
        "Name":"robert",
        "Job":"developer"
    },
    {
        "Username":"danny21",
        "Name":"danny",
        "Job":"developer"
    }
]
From previous topics I learned that we can use json.dumps, but I'm not sure whether it helps in this case.
What is the proper way to achieve it?
You could simply do:
l = []
for d in data:
    user_dictionary = {}
    user_dictionary[title[0]] = d.get_username()
    user_dictionary[title[1]] = d.get_name()
    user_dictionary[title[2]] = d.get_role()
    l.append(user_dictionary)
to get a JSON-like list of dictionaries.
You can also avoid appending and do:
def get_user_data(user):
    user_dictionary = {}
    user_dictionary[title[0]] = user.get_username()
    user_dictionary[title[1]] = user.get_name()
    user_dictionary[title[2]] = user.get_role()
    return user_dictionary
l = list(map(get_user_data, data))
You can use json.dump to write l to a file:
import json
with open('data.json', 'w') as outfile:
    json.dump(l, outfile)
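If the goal is just to print the JSON shown in the question rather than write a file, json.dumps does that directly (a one-line usage note, assuming l is built as above):
print(json.dumps(l, indent=4))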
This is Python code that I use to manipulate a file, table1, using a reference file, pds_ref.
So pds_ref looks like this:
|THE_TABLE|THE_KEY
|table1|3
|table1|1
table1 looks like this:
|ID|NAME
|1|Imran
|2|Peter
|3|Pedro
|4|Carlos
The idea is to use the references in pds_ref to remove the records with the corresponding keys from whatever table is listed; in this case, the records with keys 1 and 3 are to be deleted.
This Python code works when run as a plain Python script:
import csv
with open("pds_ref","rb") as ref_file:
refreader=csv.DictReader(ref_file, delimiter='|')
reftable=[row for row in refreader]
refheader = refreader.fieldnames
for refrow in reftable:
print refrow['THE_TABLE']
print refrow['THE_KEY']
with open(refrow['THE_TABLE'], "rbw") as infile:
reader = csv.DictReader(infile, delimiter='|')
table = [row for row in reader]
header = reader.fieldnames
with open(refrow['THE_TABLE'], "wb") as outfile:
writer = csv.DictWriter(outfile, header,delimiter='|')
writer.writeheader()
for row in table:
if row['ID'] != refrow['THE_KEY'] :
writer.writerow(row)
Now, I want to do this with an AWS Lambda function that is triggered every time someone uploads the pds_ref file.
I got as far as being able to get the pds_ref file and read each line, but I'm having trouble doing the equivalent of opening and writing back the amended table1 file. Any help appreciated.
import boto3
import csv
import io

def lambda_handler(event, context):
    s3 = boto3.client("s3")
    if event:
        print("Event : ", event)
        file_obj = event["Records"][0]
        filename = str(file_obj['s3']['object']['key'])
        bucketname = str(file_obj['s3']['bucket']['name'])
        print("Filename: ", filename)
        print("Bucket: ", bucketname)
        fileObj = s3.get_object(Bucket="lambda-trig1", Key=filename)
        print("fileObj: ", fileObj)
        file_content = fileObj["Body"].read().decode('utf-8')
        print(file_content)
        f_pds_ref = s3.get_object(Bucket="lambda-trig1", Key='pds_ref')
        fc_pds_ref = f_pds_ref['Body'].read().decode('utf-8').splitlines(True)
        for refrow in csv.DictReader(fc_pds_ref, delimiter='|'):
            print(refrow['THE_TABLE'])
            print(refrow['THE_KEY'])
            current_table = refrow['THE_TABLE']
            current_key = refrow['THE_KEY']
            f_the_next_table = s3.get_object(Bucket="lambda-trig1", Key=current_table)
            fc_the_next_table = f_the_next_table['Body'].read().decode('utf-8').splitlines(True)
            # This is where I'm stuck: the table object lives in S3, not on local disk
            with open(refrow[f_the_next_table], "rbw") as infile:
                reader = csv.DictReader(infile, delimiter='|')
                # table = [row for row in reader]
                # header = reader.fieldnames
                # print(header)
Before running the process to update the other tables, you want to ensure that it only runs for Put events.
Here are a few additions to your current steps after reading pds_ref:
Group all THE_KEY values by THE_TABLE. This allows you to update each table object once, instead of multiple times for content in the same table object.
For each THE_TABLE group, read the table object, filter away the lines whose keys are in the THE_KEY group, and write the filtered contents back to the table object.
This can be implemented in the following manner:
from contextlib import contextmanager
from csv import DictReader, DictWriter
from collections import defaultdict
import io
import boto3

s3 = boto3.client("s3")
BUCKET = "creeper-bank"
DELIMITER = "|"
TABLE_OBJECT_COLUMNS = ['', 'ID', 'NAME']
WATCH_KEY = "pds_ref"


def content_as_dict_reader(content):
    yield DictReader(
        content.splitlines(),
        delimiter=DELIMITER)


@contextmanager
def tables_and_lines_for_deletion():
    object_ = s3.get_object(
        Bucket=BUCKET, Key=WATCH_KEY
    )
    content = object_["Body"].read().decode('utf-8')
    return content_as_dict_reader(content)


@contextmanager
def table_record(table):
    object_ = s3.get_object(
        Bucket=BUCKET, Key=table
    )
    content = object_["Body"].read().decode('utf-8')
    return content_as_dict_reader(content)


def object_table(table, record):
    with io.StringIO() as file_:
        writer = DictWriter(
            file_,
            fieldnames=TABLE_OBJECT_COLUMNS,
            delimiter=DELIMITER
        )
        writer.writeheader()
        writer.writerows(list(record))
        s3.put_object(
            Bucket=BUCKET,
            Key=table,
            Body=file_.getvalue()
        )


def lambda_handler(event, context):
    if not event:
        print("Function must be triggered via a published event")
        return
    event_record, *_ = event["Records"]
    match_watchkey = True
    try:
        event_name = str(event_record['eventName'])
        if "Put" not in event_name:
            match_watchkey = False
        s3_event = event_record['s3']
        print("checking if S3 event is a put one for :WATCH_KEY")
        key = s3_event['object']['key']
        bucket = s3_event['bucket']['name']
        if key != WATCH_KEY:
            match_watchkey = False
        if bucket != BUCKET:
            match_watchkey = False
    except KeyError:
        # Handle when event_record isn't an S3 one.
        match_watchkey = False
    if not match_watchkey:
        print("Published event did not match :WATCH_KEY.")
        return
    print("S3 event is a put one for :WATCH_KEY!")
    table_group = defaultdict(list)
    print("Reading :WATCH_KEY content")
    with tables_and_lines_for_deletion() as tables:
        for dct in tables:
            table_k = dct['THE_TABLE']
            table_v = dct['THE_KEY']
            table_group[table_k].append(table_v)
    print("Updating objects found in :WATCH_KEY content")
    for t, ids in table_group.items():
        record_update = None
        with table_record(t) as record:
            record_update = (
                dct
                for dct in record
                if dct["ID"] not in ids
            )
        object_table(t, record_update)
    print("Update completed!")
    return
Testing with a sample event:
sample_event = {
    'Records': [
        {
            'eventName': 'ObjectCreated:Put',
            's3': {
                'bucket': {
                    'name': 'creeper-bank',
                },
                'object': {
                    'key': 'pds_ref',
                }
            },
        }
    ]
}
lambda_handler(sample_event, {})
I use this piece of code:
from struct import Struct
import struct

def read_chunk(fmt, fileobj):
    chunk_struct = Struct(fmt)
    chunk = fileobj.read(chunk_struct.size)
    return chunk_struct.unpack(chunk)

def read_record(fileobj):
    author_id, len_author_name = read_chunk('ii', fileobj)
    author_name, nu_of_publ = read_chunk(str(len_author_name)+'si', fileobj)  # 's' or 'c' ?
    record = { 'author_id': author_id,
               'author_name': author_name,
               'publications': [] }
    for pub in range(nu_of_publ):
        pub_id, len_pub_title = read_chunk('ii', fileobj)
        pub_title, num_pub_auth = read_chunk(str(len_pub_title)+'si', fileobj)
        record['publications'].append({
            'publication_id': pub_id,
            'publication_title': pub_title,
            'publication_authors': [] })
        for auth in range(num_pub_auth):
            len_pub_auth_name, = read_chunk('i', fileobj)
            pub_auth_name, = read_chunk(str(len_pub_auth_name)+'s', fileobj)
            record['publications'][-1]['publication_authors'].append({'name': pub_auth_name})
        year_publ, nu_of_cit = read_chunk('ii', fileobj)
        # Finish building your record with the remaining fields...
        for cit in range(nu_of_cit):
            cit_id, len_cit_title = read_chunk('ii', fileobj)
            cit_title, num_cit_auth = read_chunk(str(len_cit_title)+'si', fileobj)
            for cit_auth in range(num_cit_auth):
                len_cit_auth_name, = read_chunk('i', fileobj)
                cit_auth_name, = read_chunk(str(len_cit_auth_name)+'s', fileobj)
            year_cit_publ, = read_chunk('i', fileobj)
    return record

def parse_file(filename):
    records = []
    with open(filename, 'rb') as f:
        while True:
            try:
                records.append(read_record(f))
            except struct.error:
                break
    return records
to read this file:
https://drive.google.com/open?id=0B3SYAHrxLP69NHlWc25KeXFHNVE
with this format:
Inside the function read_record, the variables author_id, len_author_name, and author_name are read correctly, but nu_of_publ and the variables after it aren't.
Any idea what's wrong?
When I run this piece of code:
author_id, len_author_name = read_chunk('LL', f)
author_name, nu_of_publ = read_chunk(str(len_author_name)+'sL', f)
#nu_of_publ = read_chunk('I', f)  # 's' or 'c' ?
record = { 'author_id': author_id,
           'author_name': author_name,
           'publications': [] }
print(record, nu_of_publ)
for pub in range(nu_of_publ):
    pub_id, len_pub_title = read_chunk('LL', f)
    print(pub_id, len_pub_title)
I get this result:
{'author_name': b'Scott Shenker', 'author_id': 1, 'publications': []} 256
15616 1953384704
but it should print 200 instead of 256, 1 instead of 15616, etc.
This format is not correct:
author_name, nu_of_publ = read_chunk(str(len_author_name)+'si', f)
You are defining a structure of N characters and an integer. Such structures are aligned, the same way they would be if the structure were defined in C:
struct {
    char author_name[N];
    int nu_of_publ;
};
What alignment does is put the beginning of every int at a position that is a multiple of 4. This is done (in C) because CPUs are optimized for accessing such addresses.
So, if the author's name is 6 bytes long, the next two bytes will be skipped before reading the next integer.
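A quick way to see the padding is struct.calcsize (a minimal illustration; the exact numbers assume a platform where int is 4 bytes and natively aligned):
import struct

print(struct.calcsize('6si'))   # native alignment: 6 chars + 2 padding bytes + 4-byte int = 12
print(struct.calcsize('=6si'))  # '=' disables alignment: 6 chars + 4-byte int = 10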
One solution to separate the structures:
author_name, = read_chunk(str(len_author_name)+'s', f)
nu_of_publ, = read_chunk('i', f)
Note: The trailing commas (author_name, and nu_of_publ,) are there to unpack the single-element tuples returned by read_chunk.
Another solution is to specify the structure with = at the beginning (standard sizes, no alignment), based on the byte-order table in the struct documentation:
author_name, nu_of_publ = read_chunk('={}si'.format(len_author_name), f)
I'm currently building large XML files with xml.dom.minidom and then writing them out to a file via toprettyxml. Is there a way to stream the XML to a document? I'm hitting memory errors.
def run(self):
    while True:
        domain = self.queue.get()
        try:
            conn = boto.connect_sdb(awsa, awss)
            sdbdomain = conn.get_domain(domain)
            s3conn = boto.connect_s3(awsa, awss)
            archbucket = s3conn.get_bucket("simpledbbu")
            doc = None
            doc = Document()
            root = doc.createElement("items")
            doc.appendChild(root)
            countermax = 0
            counter = 0
            for item in sdbdomain:
                node = doc.createElement("item")
                node.setAttribute("itemName", item.name)
                for k, v in item.items():
                    if not isinstance(v, basestring):
                        i = 0
                        for val in v:
                            node.setAttribute("{0}::{1}".format(k, i), val)
                            i += 1
                    else:
                        node.setAttribute(k, v)
                root.appendChild(node)
            k = Key(archbucket)
            k.key = "{0}/{1}.xml".format(datetime.date.today().strftime("%Y%m%d"), sdbdomain.name)
            #x = doc.toprettyxml(indent=" ")
            f = open(domain + ".xml", "w")
            f.truncate()
            f.write(doc.toprettyxml(indent=" "))
            f.close()
            #k.content_type.encode('ascii')
            k.set_contents_from_filename(f.name)
            os.remove(os.path.join(os.getcwd(), f.name))
        except:
            print "failed to load domain: {0}".format(domain)
            print formatExceptionInfo()
        finally:
            self.queue.task_done()
building large xml files with xml.dom.minidom and then writing them out to file via the toprettyxml.
If you run out of memory you should probably stop doing that.
You can build XML with simple string manipulation.
with open(domain + ".xml", "w") as f:
f.write( "<?xml..." )
f.write( "<items>" )
for item in sdbdomain:
buffer= []
for k,v in item.items():
if not isinstance(v, basestring):
for i, val in enumerate(v):
txt= '{0}::{1}="{2}"'.format(k,i,val)
else:
txt= '{0}="{1}"'.format(k,v)
buffer.append( txt )
f.write( " <item {0}/>\n".format( " ".join(buffer) ))
f.write( "</items>" )
k= ................
k.set_contents_from_filename(f.name)
Something like that ought to allow you to write the XML to a temporary file without making a large DOM object in memory.
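One caveat the sketch above glosses over: attribute values coming from SimpleDB may contain ", <, or &, so they should be escaped when building XML by hand. A minimal addition using the standard library (k and v are the loop variables from the snippet above):
from xml.sax.saxutils import quoteattr

# quoteattr returns the value already wrapped in quotes, with &, <, > and quote characters escaped
txt = '{0}={1}'.format(k, quoteattr(v))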