how to convert lines of json into a hashmap of composite keys? - python

Background on the actual problem: I am trying to create an AWS Lambda function in Python that accumulates records from a DynamoDB stream into an S3 object. If you don't understand this context you can just ignore it, the question is really a pure Python question.
I got the code below barely working: the file is successfully concatenated with new records from the stream, in the desired format (one JSON object per line). But what I really want is to treat the file as a hashmap, using the fields in keys (4th line of the function definition), which are a subset of the fields in new, so that any incoming records will overwrite old records containing the same key values.
What is the obvious / idiomatic way to change the line journal += data so that instead of a concatenation, I get an overwrite of lines with the same key values?
import json
import boto3
import re
import uuid
from decimal import Decimal
import six
import sys
from datetime import datetime
from boto3.dynamodb.types import TypeSerializer
s3 = boto3.resource('s3')
def lambda_handler(event, context):
    object = s3.Object('some.bucket', 'address/dynamo-stream.json')
    journal = object.get()['Body'].read().decode('utf-8')
    for record in event['Records']:
        keys = record['dynamodb'].get('Keys')
        new = record['dynamodb'].get('NewImage')
        if new:
            data = json.dumps(loads(new))
            journal += data + "\n"
    object.put(Body=journal)
    return "ok"
# below: code from https://github.com/Alonreznik/dynamodb-json/blob/master/dynamodb_json/json_util.py
[...]
def loads(s, as_dict=False, *args, **kwargs):
    [...]
More explanation:
The variable keys is a subset of new in the sense that, for any json value of new in the format
{ "k1":"v1", "k2:v2", "k3:v3", ... "kN:vN" }
keys will have the value
{ "k1":"v1", "k2:v2" }

Related

"KeyError: 'added' "when importing different lists of json

I'm getting a "KeyError: 'added'" Error when i try to import one of my three lists inside a json file. The list i'm trying to import is the one called "added". I had this working when it was just one list without the name "added" on top but now it seems like i can't access the list anymore. I want to individually import them basically.
This is my code to get the Json parts before i import them into the database:
import requests
import json
from users.models import Facility, FacilityAddress, FacilityInspectionInfo, FacilityComplaints
from django.core.management.base import BaseCommand

IMPORT_URL = 'https://url/imports.json'

class Command(BaseCommand):
    def import_facility_from_file(self, data):
        UUID = data.get('UUID', None)
        Name = data.get('Name', None)
        PrimaryAddress = data["AddressInfo"]["PrimaryAddress"]
        """ This is what happens after the database entry related code """

    def handle(self, *args, **options):
        """
        Call the function to import data from json url
        """
        headers = {'Content-Type': 'application/json'}
        response = requests.get(
            url=IMPORT_URL,
            headers=headers,
        )
        response.raise_for_status()
        data = response.json()
        for key, data_object in data.items():
            self.import_facility_from_file(data_object)
The new version of the JSON file that I'm trying to use, but that's causing the error:
{
  "added": {
    "125hk24h5kjh43k5": {
      "UUID": "125hk24h5kjh43k5",
      "Name": "Test Facility 1",
      "AddressInfo": {"PrimaryAddress": "1234 Drive RD"},
      "ImporterLastModifiedTimestamp": 1643721420
    }
  },
  "deleted": ["235hk24h5kjh43k5,235hk345789h43k5"],
  "modified": {
    "995hk24h5kjh43k5": {
      "UUID": "995hk24h5kjh43k5",
      "Name": "Test Facility 2",
      "AddressInfo": {"PrimaryAddress": "2345 Test RD"},
      "ImporterLastModifiedTimestamp": 1643721420
    }
  }
}
The old version of the JSON file that worked perfectly with the code I initially wrote:
{"00016ed7be4872a19d6e16afc98a7389b2bb324a2":
{"UUID":"00016ed7be4872a19d6e1ed6f36b647f3eb41cadedd2130b103a5851caebc26fbbbf24c2f1a64d2cf34ac4e03aaa30309816f58c397e6afc98a7389b2bb324a2","Name":"Test Facility","IssuedNumber":"123456","Licensee":"Test Licensee","Email":"test#example.com","AdministratorName":"Test Name","TelephoneNumber":"(123) 456-7890324879","ImporterLastModifiedTimestamp":"1362985200",
"AddressInfo":{"PrimaryAddress":"123 Fake Road","SecondaryAddress":"","City":"Testcity","RegionOrState":"TX","PostalCode":"12345","Geolocation":"00.0000,-00.0000"},"Capacity":100,"MostRecentLicenseTimestamp":1575180000,"ClosedTimestamp":0,
"InspectionInfo":{"ComplaintRelatedVisits":0,"InspectionRelatedVisits":0,"NumberOfVisits":0,"LastVisitTimestamp":0},
"Complaints":{"ComplaintsTypeA":0,"ComplaintsTypeB":0,"SubstantiatedAllegations":0,"TotalAllegations":0}},
"00016ed7be4872a15435435435b2bb324a2":
{"UUID":"000c93dcb7a0b3d5783bb330892aff6abdb9fb57a7d3701c2d903f3640877579f3173ecd8a80532f6c3d53dbacde78a6a54ae42fef321a5793f5a01934f8de7a","Name":"Test Facility 2","IssuedNumber":"123456","Licensee":"Test Licensee","Email":"test#example.com","AdministratorName":"Test Name","TelephoneNumber":"(123) 456-7890324879","ImporterLastModifiedTimestamp":"1362985200",
"AddressInfo":{"PrimaryAddress":"123 Fake Road","SecondaryAddress":"","City":"Testcity","RegionOrState":"TX","PostalCode":"12345","Geolocation":"00.0000,-00.0000"},"Capacity":100,"MostRecentLicenseTimestamp":1575180000,"ClosedTimestamp":0,
"InspectionInfo":{"ComplaintRelatedVisits":0,"InspectionRelatedVisits":0,"NumberOfVisits":0,"LastVisitTimestamp":0},
"Complaints":{"ComplaintsTypeA":0,"ComplaintsTypeB":0,"SubstantiatedAllegations":0,"TotalAllegations":0}},
"00234324324343243afc98a7389b2bb324a2":
{"UUID":"fffd4dec10054e6e1deb2a2266a7c6bb0136ba46222e734ceed5855651f735cfbe0bb66cfaf27c3d175ae261a8f6df0c36b5390d15c70b07d67e35e1081aaf6d","Name":"Test Facility 3","IssuedNumber":"123456","Licensee":"Test Licensee","Email":"test#example.com","AdministratorName":"Test Name","TelephoneNumber":"(123) 456-7890324879","ImporterLastModifiedTimestamp":"1362985200",
"AddressInfo":{"PrimaryAddress":"123 Fake Road","SecondaryAddress":"","City":"Testcity","RegionOrState":"TX","PostalCode":"12345","Geolocation":"00.0000,-00.0000"},"Capacity":100,"MostRecentLicenseTimestamp":1575180000,"ClosedTimestamp":0,
"InspectionInfo":{"ComplaintRelatedVisits":0,"InspectionRelatedVisits":0,"NumberOfVisits":0,"LastVisitTimestamp":0},
"Complaints":{"ComplaintsTypeA":0,"ComplaintsTypeB":0,"SubstantiatedAllegations":0,"TotalAllegations":0}}}
So I tried it like this to get the UUID and other information from the modified JSON file:
UUID = data["added"]["UUID"]
but I'm getting this error:
KeyError: 'added'
It sounds like the json format has changed underneath you and you need to adapt.
How about processing all the items in both the "added" and "modified" sections?
for key, data_object in data.items():
    if key in ["added", "modified"]:
        for key, data in data_object.items():
            self.import_facility_from_file(data)
You get AttributeError: 'list' object has no attribute 'get' because there is a list (the "deleted" entry) nested among the dictionaries in your data, so you can add a check for the type like this:
for key, data_object in data.items():
    if type(data_object) == dict:
        for data in data_object.values():
            self.import_facility_from_file(data)
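Putting the two answers together (only the dict-valued "added" and "modified" sections hold facility records, while "deleted" is a list of ids), a combined sketch inside handle might look like this:

for section, entries in data.items():
    # Skip the "deleted" list; only the dict sections contain facility objects.
    if section in ("added", "modified") and isinstance(entries, dict):
        for facility in entries.values():
            self.import_facility_from_file(facility)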

table.put_item w/ Partition and Sort Key in Python (Lambda to DynamoDB)

I am altering a Lambda function (written in Python) that puts an item into a DynamoDB table. I have no issues when using the function to write to a DDB table with only a primary key, but get the following error after aligning attribute names and writing to a table with a partition and sort key:
An error occurred (ValidationException) when calling the PutItem operation:
One or more parameter values were invalid: Missing the key ID
Here is my function:
# import the json utility package since we will be working with a JSON object
import json
# import the AWS SDK (for Python the package name is boto3)
import boto3
# import two packages to help us with dates and date formatting
from time import gmtime, strftime

# create a DynamoDB object using the AWS SDK
dynamodb = boto3.resource('dynamodb')
# use the DynamoDB object to select our table
table = dynamodb.Table('PSCartTracking')
# store the current time in a human readable format in a variable
now = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())

# define the handler function that the Lambda service will use as an entry point
def lambda_handler(event, context):
    # extract values from the event object we got from the Lambda service and store in a variable
    login = event['login']
    computer = event['computer']
    timestamp = event['timestamp']
    # write name and time to the DynamoDB table using the object we instantiated and save response in a variable
    response = table.put_item(
        Item={
            'ID': computer,
            'Timestamp': timestamp,
            'Login': login
        })
    # return a properly formatted JSON object
    return {
        'statusCode': 200,
        'body': json.dumps(login + ' checked out ' + computer + ' on ' + now)
    }
Keys for Table:
Partition key
ID (String)
Sort key
Timestamp (String)
I've been trying to read documentation to figure out what about my formatting might be preventing it from recognizing the sort key, but I'm pretty positive that is the part of the composite it is not seeing.
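One way to narrow this down is to compare the table's actual key schema against the attribute names passed in Item; a hedged debugging sketch, reusing the PSCartTracking table name from the question:

import boto3

# Print the key schema DynamoDB actually expects; the Item passed to put_item must contain
# an attribute for every key listed here, with exactly matching names (names are case sensitive,
# so 'ID' and 'Id' are different attributes).
client = boto3.client('dynamodb')
schema = client.describe_table(TableName='PSCartTracking')['Table']['KeySchema']
for key in schema:
    print(key['AttributeName'], key['KeyType'])   # e.g. ID HASH, Timestamp RANGE

If the printed names differ from 'ID' and 'Timestamp', renaming the Item keys to match the schema should resolve the ValidationException.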

How to retrieve data stored from MongoDB using BSON (Python)?

So far, in my code below, I have managed to store my data in MongoDB.
Now I want to be able to retrieve the data I have stored.
As you can see, I have been trying but keep getting an error.
With BSON, do I have to first decode the data to retrieve it from MongoDB?
Any help would be greatly appreciated!
(Apologies for the messy code, I am just practicing through trial and error)
import json
from json import JSONEncoder
import pymongo
from pymongo import MongoClient
from bson.binary import Binary
import pickle

# Do this for each
client = MongoClient("localhost", 27017)
db = client['datacampdb']
coll = db.personpractice4_collection  # creating a collection in the database
# my collection on the database is called personpractice4_collection

class Person:
    def __init__(self, norwegian, dame, brit, german, sweed):
        self.__norwegian = norwegian
        self.__dame = dame
        self.__brit = brit
        self.__german = german  # private variable
        self.__sweed = sweed
    # create getters and setters later to make OOP

personone = Person("norwegian", "dame", "brit", "german", "sweed")

class PersonpracticeEncoder(JSONEncoder):
    def default(self, o):
        return o.__dict__

# Encode Person Object into JSON
personpracticeJson = json.dumps(personone, indent=4, cls=PersonpracticeEncoder)
practicedata = pickle.dumps(personpracticeJson)
coll.insert_one({'bin-data': Binary(practicedata)})
# print(personpracticeJson)
# print(db.list_collection_names())  # get the names of my collections in DB

# retrieving data from mongodb
# Retrieving a Single Document with find_one()
print(({'bin-data': Binary(practicedata)}).find_one())  # not working
The find_one method should be called on a collection.
{'bin-data': Binary(practicedata)} is a query to find a document:
coll.find_one({'bin-data': Binary(practicedata)})
Which means: find a document in the collection coll where bin-data is equal to Binary(practicedata).
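To recover the original JSON string, the stored bytes also have to be unpickled; a minimal sketch, assuming the coll collection and 'bin-data' field from the question:

import pickle

# Fetch one stored document and reverse the pickling step to get back the JSON string.
doc = coll.find_one({'bin-data': {'$exists': True}})
if doc is not None:
    stored_json = pickle.loads(doc['bin-data'])  # the text produced by json.dumps(...)
    print(stored_json)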

cURL method in Python for JSON feed [duplicate]

This question already has answers here:
How to download a file over HTTP?
(30 answers)
Closed 7 years ago.
While building a Flask website, I'm using an external JSON feed to feed the local MongoDB with content. This feed is parsed and fed while repurposing keys from the JSON to keys in Mongo.
One of the available keys from the feed is called "img_url" and contains, guess what, a URL to an image.
Is there a way, in Python, to mimic PHP-style cURL? I'd like to grab that key, download the image, and store it somewhere locally while keeping the other associated keys, and have that as an entry in my db.
Here is my script up to now:
import json
import sys
import urllib2
from datetime import datetime
import pymongo
import pytz
from utils import slugify
# from utils import logger

client = pymongo.MongoClient()
db = client.artlogic

def fetch_artworks():
    # logger.debug("downloading artwork data from Artlogic")
    AL_artworks = []
    AL_artists = []
    url = "http://feeds.artlogic.net/artworks/artlogiconline/json/"
    while True:
        f = urllib2.urlopen(url)
        data = json.load(f)
        AL_artworks += data['rows']
        # logger.debug("retrieved page %s of %s of artwork data" % (data['feed_data']['page'], data['feed_data']['no_of_pages']))
        # Stop, we are at the last page
        if data['feed_data']['page'] == data['feed_data']['no_of_pages']:
            break
        url = data['feed_data']['next_page_link']
    # Now we have a list called 'artworks' in which all the descriptions are stored.
    # We are going to put them into the mongoDB database,
    # making sure that if the artwork is already encoded (an object with the same id
    # already is in the database) we update the existing description instead of
    # inserting a new one ('upsert').
    # logger.debug("updating local mongodb database with %s entries" % len(artworks))
    for artwork in AL_artworks:
        # Mongo does not like keys that have a dot in their name,
        # this property does not seem to be used anyway so let us
        # delete it:
        if 'artworks.description2' in artwork:
            del artwork['artworks.description2']
        # upsert into the database:
        db.AL_artworks.update({"id": artwork['id']}, artwork, upsert=True)
        # artwork['artist_id'] is not functioning properly
        db.AL_artists.update({"artist": artwork['artist']},
                             {"artist_sort": artwork['artist_sort'],
                              "artist": artwork['artist'],
                              "slug": slugify(artwork['artist'])},
                             upsert=True)
    # db.meta.update({"subject": "artworks"}, {"updated": datetime.now(pytz.utc), "subject": "artworks"}, upsert=True)
    return AL_artworks

if __name__ == "__main__":
    fetch_artworks()
First, you might like the requests library.
Otherwise, if you want to stick to the stdlib, it will be something along the lines of:
import os
import urllib2
import uuid

def fetchfile(url, dst):
    # Stream the response to a local file in 4 KB chunks.
    fi = urllib2.urlopen(url)
    with open(dst, 'wb') as fo:
        while True:
            chunk = fi.read(4096)
            if not chunk:
                break
            fo.write(chunk)

fetchfile(
    data['feed_data']['next_page_link'],
    os.path.join('/var/www/static', uuid.uuid1().hex)
)
With the correct exception handling, of course (I can elaborate if you want, but I'm sure the documentation will be clear enough).
You could put the fetchfile() into a pool of async jobs to fetch many files at once.
https://docs.python.org/2/library/json.html
https://docs.python.org/2/library/urllib2.html
https://docs.python.org/2/library/tempfile.html
https://docs.python.org/2/library/multiprocessing.html
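And for the requests route mentioned at the top of this answer, a minimal sketch (img_url is the feed key named in the question; the destination directory is only an example):

import os
import uuid
import requests

def fetch_image(img_url, dst_dir='/var/www/static'):
    # Download the image and save it under a generated filename, returning the local path.
    response = requests.get(img_url, stream=True)
    response.raise_for_status()
    dst = os.path.join(dst_dir, uuid.uuid4().hex)
    with open(dst, 'wb') as fo:
        for chunk in response.iter_content(4096):
            fo.write(chunk)
    return dst

The returned path can then be stored alongside the other keys when upserting the artwork document.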

Store a list of dictionaries in GAE

I have a list of about 20 objects and for each object I return a list of 10 dictionaries.
I am trying to store the list of 10 dictionaries for each object in the list on GAE; I do not think I am writing the code correctly to store this information to GAE.
Here is what I have:
Before my main request handler I have this class:
class Tw(db.Model):
    tags = db.ListProperty()
    ip = db.StringProperty()
In my main request handler I have the following:
for city in lst_of_cities:  # this is the list of 20 objects
    dict_info = hw12.twitter(city)  # this is the function to get the list of 10 dictionaries for each object in the list
    datastore = Tw()  # this is the class defined for db.Model
    datastore.tags.append(dict_info)
    datastore.ip = self.request.remote_addr
    datastore.put()
data = Data.gql("")  # data entities we need to fetch
I am not sure if this code is right at all. If anyone could please help, it would be much appreciated.
Welcome to Stack Overflow!
I see a few issues:
Dictionaries are not supported value types for App Engine properties.
You're only storing the last entity; the rest are discarded.
You're using a ListProperty, but instead of appending each element of dict_info, you're doing a single append of the entire list.
Since you can't store a raw dictionary inside a property, you need to serialize it to some other format, like JSON or pickle. Here's a revised example using pickle:
from google.appengine.ext import db
import pickle

class Tw(db.Model):
    tags = db.BlobProperty()
    ip = db.StringProperty()

entities = []
for city in lst_of_cities:
    dict_info = hw12.twitter(city)
    entity = Tw()
    entity.tags = db.Blob(pickle.dumps(dict_info))
    entity.ip = self.request.remote_addr
    entities.append(entity)
db.put(entities)
When you fetch the entity later, you can retrieve your list of dictionaries with pickle.loads(entity.tags).
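For example, a minimal retrieval sketch with the old db API, assuming the Tw model above:

# Iterate over stored Tw entities and unpickle the blob back into the list of dictionaries.
for entity in Tw.all():
    dict_list = pickle.loads(entity.tags)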
When I deal with data types that are not directly supported by Google App Engine, like dictionaries or custom data types, I usually adopt the handy PickleProperty.
from google.appengine.ext import db
import pickle

class PickleProperty(db.Property):
    def get_value_for_datastore(self, model_instance):
        value = getattr(model_instance, self.name, None)
        return pickle.dumps(value)

    def make_value_from_datastore(self, value):
        return pickle.loads(value)
Once the PickleProperty class is declared in your commons.py module, you can use it to store your custom data with something like this:
from google.appengine.ext import db
from commons import PickleProperty

class Tw(db.Model):
    tags = PickleProperty()
    ip = db.StringProperty()

entities = []
for city in lst_of_cities:
    dict_info = hw12.twitter(city)
    entity = Tw()
    entity.tags = dict_info
    entity.ip = self.request.remote_addr
    entities.append(entity)
db.put(entities)
To retrieve the data back go with:
entity.tags
Since this was written, App Engine has pushed out its experimental "ndb" Python database model, which contains in particular the JsonProperty, something that pretty directly implements what you want.
Now, you need to be running the Python 2.7 version of App Engine, which is still not quite ready for production, but it all seems pretty stable these days, GvR himself seems to be writing a lot of the code (which bodes well for the code quality), and I'm intending to use this in production sometime this year...
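A minimal sketch of that ndb route, assuming it runs in the same request handler and with the same field names as the models above:

from google.appengine.ext import ndb

class Tw(ndb.Model):
    # JsonProperty serializes the list of dictionaries to JSON in the datastore.
    tags = ndb.JsonProperty()
    ip = ndb.StringProperty()

entity = Tw(tags=dict_info, ip=self.request.remote_addr)
entity.put()
# Reading entity.tags later returns the list of dictionaries directly.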
