Django ORM: how to get raw values grouped by a field - Python

I have a model which is like so:
class CPUReading(models.Model):
    host = models.CharField(max_length=256)
    reading = models.IntegerField()
    created = models.DateTimeField(auto_now_add=True)
I am trying to get a result which looks like the following:
{
    "host 1": [
        {
            "created": DateTimeField(...),
            "value": 20
        },
        {
            "created": DateTimeField(...),
            "value": 40
        },
        ...
    ],
    "host 2": [
        {
            "created": DateTimeField(...),
            "value": 19
        },
        {
            "created": DateTimeField(...),
            "value": 10
        },
        ...
    ]
}
I need it grouped by host and ordered by created.
I have tried a bunch of stuff, including using values() and annotate() to build a GROUP BY clause, but I think I must be missing something: GROUP BY seems to require an aggregation function, which I don't want here. I need the actual values of the reading field, grouped by the host field and ordered by the created field.
This is more-or-less how any charting library needs the data.
I know I can make it happen with either Python code or raw SQL queries, but I'd much prefer to use the Django ORM, unless it explicitly disallows this sort of query.

As far as I'm aware, there's nothing in the ORM that makes this easy. If you want to avoid raw queries, and you're willing and able to change your data structure, you can solve this mostly in the ORM, with Python code kept to a minimum:
class Host(models.Model):
    pass

class CPUReading(models.Model):
    host = models.ForeignKey(Host, related_name="readings", on_delete=models.CASCADE)
    reading = models.IntegerField()
    created = models.DateTimeField(auto_now_add=True)
With this you can use two queries with fairly clean code:
from collections import defaultdict

results = defaultdict(list)
hosts = Host.objects.prefetch_related("readings")
for host in hosts:
    for reading in host.readings.all():
        results[host.id].append(
            {"created": reading.created, "value": reading.reading}
        )
Or you can do it a little more efficiently with one query and a single loop:
from collections import defaultdict

results = defaultdict(list)
readings = CPUReading.objects.select_related("host")
for reading in readings:
    results[reading.host.id].append(
        {"created": reading.created, "value": reading.reading}
    )
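As an aside (not part of the original answer): if you want the dictionary keyed by a human-readable label, like the "host 1" / "host 2" keys in the question, rather than by primary key, one option is to give Host a name field and key on that instead. A small, hypothetical variant:

from collections import defaultdict

# Assumes a hypothetical Host.name CharField; otherwise identical to the loop above.
results = defaultdict(list)
for reading in CPUReading.objects.select_related("host"):
    results[reading.host.name].append(
        {"created": reading.created, "value": reading.reading}
    )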

Assuming you are using PostgreSQL, you can use a combination of array_agg and json_object to achieve what you're after.
from django.contrib.postgres.aggregates import ArrayAgg
from django.contrib.postgres.fields import ArrayField, JSONField
from django.db.models import CharField
from django.db.models.expressions import Func, Value

class JSONObject(Func):
    function = 'json_object'
    output_field = JSONField()

    def __init__(self, **fields):
        fields, expressions = zip(*fields.items())
        super().__init__(
            Value(fields, output_field=ArrayField(CharField())),
            Func(*expressions, template='array[%(expressions)s]'),
        )

readings = dict(CPUReading.objects.values_list(
    'host',
    ArrayAgg(
        JSONObject(
            created='created',
            value='reading',
        ),
        ordering='created',
    ),
))
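For completeness: on Django 3.2 and newer there is a built-in JSONObject expression in django.db.models.functions, so the custom Func above shouldn't be needed. A rough equivalent sketch, still assuming PostgreSQL and the field names from the question's model:

from django.contrib.postgres.aggregates import ArrayAgg
from django.db.models.functions import JSONObject  # Django 3.2+

# Builds a dict of host -> list of {"created": ..., "value": ...}, ordered by created
readings = dict(
    CPUReading.objects.values_list(
        'host',
        ArrayAgg(
            JSONObject(created='created', value='reading'),
            ordering='created',
        ),
    )
)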

If you want to stay close to the Django ORM, just remember that this returns a dictionary rather than a queryset and is evaluated immediately, so don't use it in declarative scope. The interface is otherwise similar to QuerySet.values(), with the additional requirement that the queryset must be ordered first.
import itertools

from django.db import models

class PlotQuerySet(models.QuerySet):
    def grouped_values(self, key_field, *fields, **expressions):
        if key_field not in fields:
            fields += (key_field,)
        values = self.values(*fields, **expressions)

        data = {}
        for key, gen in itertools.groupby(values, lambda x: x.pop(key_field)):
            data[key] = list(gen)
        return data

PlotManager = models.Manager.from_queryset(PlotQuerySet, class_name='PlotManager')

class CpuReading(models.Model):
    host = models.CharField(max_length=255)
    reading = models.IntegerField()
    created_at = models.DateTimeField(auto_now_add=True)

    objects = PlotManager()
Example:
CpuReading.objects.order_by(
    'host', 'created_at'
).grouped_values(
    'host', 'created_at', 'reading'
)
Out[10]:
{'a': [{'created_at': datetime.datetime(2020, 7, 13, 16, 45, 23, 215005, tzinfo=<UTC>),
        'reading': 0},
       {'created_at': datetime.datetime(2020, 7, 13, 16, 45, 23, 223080, tzinfo=<UTC>),
        'reading': 1},
       {'created_at': datetime.datetime(2020, 7, 13, 16, 45, 23, 230218, tzinfo=<UTC>),
        'reading': 2},
       ...],
 'b': [{'created_at': datetime.datetime(2020, 7, 13, 16, 45, 23, 241476, tzinfo=<UTC>),
        'reading': 0},
       {'created_at': datetime.datetime(2020, 7, 13, 16, 45, 23, 242015, tzinfo=<UTC>),
        'reading': 1},
       {'created_at': datetime.datetime(2020, 7, 13, 16, 45, 23, 242537, tzinfo=<UTC>),
        'reading': 2},
       ...]}

Related

Get new item uploaded in AWS DynamoDB table streaming to Lambda using python

I created a framework that saves information in a DynamoDB table, and I need the last uploaded item. The DynamoDB table looks like this:
{'Table': {'AttributeDefinitions': [{'AttributeName': 'ID',
                                     'AttributeType': 'S'}],
           'TableName': 'Bulk_query_database',
           'KeySchema': [{'AttributeName': 'ID', 'KeyType': 'HASH'}],
           'TableStatus': 'ACTIVE',
           'CreationDateTime': datetime.datetime(2022, 10, 6, 21, 58, 20, 293000, tzinfo=tzlocal()),
           'ProvisionedThroughput': {'LastDecreaseDateTime': datetime.datetime(2022, 10, 6, 22, 8, 40, 735000, tzinfo=tzlocal()),
                                     'NumberOfDecreasesToday': 0,
                                     'ReadCapacityUnits': 1,
                                     'WriteCapacityUnits': 1},
           'TableSizeBytes': 59,
           'ItemCount': 1,
So far I have connected a DynamoDB stream as a trigger for a Lambda function; this function should print the last element inserted in the table. The query I'm using is this:
dynamodb_resource = boto3.resource('dynamodb', region_name="us-east-1")
table = dynamodb_resource.Table('Bulk_query_database')
response = table.query(KeyConditionExpression=Key('ID').eq('I CANT MAKE IT WORK UNLESS I USE THE ID STRING'))
items = response['Items']
print(items)
There is no response using this query. What can I do to get the last element loaded into the table?
Your question isn't so clear. From my understanding, you want to query the item stored in DynamoDB that is highlighted in the image you shared. To do this you need to set the value of ID:
response = table.query(KeyConditionExpression=Key('ID').eq('3P3AD596A'))
items = response['Items']
In DynamoDB, items are stored based on the key (in your case ID), and to retrieve those items you need to pass the key value in your requests.
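As an aside (not part of the original answer): since the table's stream is already wired up as a Lambda trigger, the newly inserted item can usually be read straight from the event, without querying the table at all. A rough sketch, assuming the stream view type includes the new image:

def lambda_handler(event, context):
    # Each stream record describes one change; INSERT records carry the new item
    # in DynamoDB attribute-value format, e.g. {'ID': {'S': '3P3AD596A'}}.
    for record in event['Records']:
        if record['eventName'] == 'INSERT':
            new_item = record['dynamodb']['NewImage']
            print(new_item)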

Pydantic: JSON-encoding dictionary keys

When using (hashable) objects as dictionary keys, calling .json() fails because while the values are encoded, the keys aren't:
from pydantic import BaseModel
from typing import Dict
from datetime import datetime

class Foo(BaseModel):
    date: datetime
    sdict: Dict[datetime, str]

    class Config:
        json_encoders = {
            datetime: repr
        }

foo = Foo(date=datetime.now(), sdict={datetime.now(): 'now'})
foo
# Foo(date=datetime.datetime(2021, 9, 3, 12, 9, 55, 36105), sdict={datetime.datetime(2021, 9, 3, 12, 9, 55, 36114): 'now'})
foo.json()
# TypeError: keys must be a string

# to prove the other way around works:
class Foo(BaseModel):
    date: datetime
    sdict: Dict[str, datetime]

    class Config:
        json_encoders = {
            datetime: repr
        }

foo = Foo(date=datetime.now(), sdict={'now': datetime.now()})
foo.json()
# '{"date": "datetime.datetime(2021, 9, 3, 12, 13, 30, 606880)", "sdict": {"now": "datetime.datetime(2021, 9, 3, 12, 13, 30, 606884)"}}'
This is because the default= param of json.dumps(), which is ultimately used for dumping, doesn't encode dictionary keys. Defining a JSON encoder class does work, but it doesn't suit me for other reasons.
I've seen TypedDict in pydantic, but it doesn't seem to fix the issue. Actually, I'm unsure what the use of TypedDict is, since AFAICS you need to define every key in the dict, which makes it analogous to a static object?
My use-case is that I need to represent the following idea:
{
    "report": {
        "warehouses": {
            warehouse.id: {
                "name": warehouse.name,
                "address": warehouse.address,
            }
            for warehouse in warehouses
        }
    }
}
where warehouse.id is an Identifier object which can convert to different formats on demand, and which the JSON encoder will convert to a string.
Anyone knows of a way other than a dictionary where I can add arbitrary keys to an object in a way that will be affected by the json encoder, or some other way of serializing?
One option for solving the problem is to use a custom json_dumps function for the pydantic model and do the custom serialization inside it. I did it by inheriting from JSONEncoder.
For example, like this:
import json
from pydantic import BaseModel
from typing import Dict
from datetime import datetime

class CustomEncoder(json.JSONEncoder):
    def _transform(self, v):
        res = v
        if isinstance(v, datetime):
            res = v.isoformat()
        # else other variants
        return self._encode(res)

    def _encode(self, obj):
        if isinstance(obj, dict):
            return {self._transform(k): self._transform(v) for k, v in obj.items()}
        else:
            return obj

    def encode(self, obj):
        return super(CustomEncoder, self).encode(self._encode(obj))

def custom_dumps(values, *, default):
    return CustomEncoder().encode(values)

class Foo(BaseModel):
    date: datetime
    sdict: Dict[datetime, str]

    class Config:
        json_dumps = custom_dumps

foo = Foo(date=datetime.now(), sdict={datetime.now(): 'now'})
# Foo(date=datetime(2021, 9, 3, 12, 9, 55, 36105), sdict={datetime(2021, 9, 3, 12, 9, 55, 36114): 'now'})
print(foo.json())
# {"date": "2021-09-07T16:02:51.070159", "sdict": {"2021-09-07T16:02:51.070164": "now"}}

AWS Config Python Unicode Mess

I am running into an issue trying to pull out usable items from this output. I am just trying to pull a single value from this string of Unicode and it has been super fun.
My print(response) returns this (FYI, it is way longer than this little snippet):
{u'configurationItems': [{u'configurationItemCaptureTime': datetime.datetime(2020, 6, 4, 21, 56, 31, 134000, tzinfo=tzlocal()), u'resourceCreationTime': datetime.datetime(2020, 5, 22, 16, 32, 55, 162000, tzinfo=tzlocal()), u'availabilityZone': u'Not Applicable', u'awsRegion': u'us-east-1', u'tags': {u'brassmonkeynew': u'tomtagnew'}, u'resourceType': u'AWS::DynamoDB::Table', u'resourceId': u'tj-test2', u'configurationStateId': u'1591307791134', u'relatedEvents': [], u'relationships': [], u'arn': u'arn:aws:dynamodb:us-east-1:896911201517:table/tj-test2', u'version': u'1.3', u'configurationItemMD5Hash': u'', u'supplementaryConfiguration': {u'ContinuousBackupsDescription': u'{"continuousBackupsStatus":"ENABLED","pointInTimeRecoveryDescription":{"pointInTimeRecoveryStatus":"DISABLED"}}', u'Tags': u'[{"key":"brassmonkeynew","value":"tomtagnew"}]'}, u'resourceName': u'tj-test2', u'configuration': u'{"attributeDefinitions":[{"attributeName":"tj-test2","attributeType":"S"}],"tableName":"tj-test2","keySchema":[{"attributeName":"tj-test2","keyType":"HASH"}],"tableStatus":"ACTIVE","creationDateTime":1590165175162,"provisionedThroughput":{"numberOfDecreasesToday":0,"readCapacityUnits":5,"writeCapacityUnits":5},"tableArn":"arn:aws:dynamodb:us-east-1:896911201517:table/tj-test2","tableId":"816956d7-95d1-4d31-8d18-f11b18de4643"}', u'configurationItemStatus': u'OK', u'accountId': u'896911201517'}, {u'configurationItemCaptureTime': datetime.datetime(2020, 6, 1, 16, 27, 21, 316000, tzinfo=tzlocal()), u'resourceCreationTime': datetime.datetime(2020, 5, 22, 16, 32, 55, 162000, tzinfo=tzlocal()), u'availabilityZone': u'Not Applicable', u'awsRegion': u'us-east-1', u'tags': {u'brassmonkeynew': u'tomtagnew', u'backup-schedule': u'daily'}, u'resourceType': u'AWS::DynamoDB::Table', u'resourceId': u'tj-test2', u'configurationStateId': u'1591028841316', u'relatedEvents': [], u'relationships': [], u'arn': u'arn:aws:dynamodb:us-east-1:896911201517:table/tj-test2', u'version': u'1.3', u'configurationItemMD5Hash': u'', u'supplementaryConfiguration': {u'ContinuousBackupsDescription': u'{"continuousBackupsStatus":"ENABLED","pointInTimeRecoveryDescription":{"pointInTimeRecoveryStatus":"DISABLED"}}', u'Tags': u'[{"key":"brassmonkeynew","value":"tomtagnew"},{"key":"backup-schedule","value":"daily"}]'}, u'resourceName': u'tj-test2', u'configuration': u'{"attributeDefinitions":[{"attributeName":"tj-test2","attributeType":"S"}],"tableName":"tj-test2","keySchema":[{"attributeName":"tj-
and so on. I have tried a few different ways of getting this info, but every time I get a key error.
I also tried converting this into JSON, but since I have date/time at the top it gives me this error:
TypeError: [] is not JSON serializable
Failed attempts:
# print(response[0]["tableArn"])
print(response2)
print(response2['tableArn'])
print(response2.arn)
print(response2['configurationItems'][0]['tableArn'])
print(response2['configurationItems']['tableArn'])
print(response.configurationItems[0])
arn = response.configurationItems[0].arn
def lambda_handler(event, context):
    # print("Received event: " + json.dumps(event, indent=2))
    message = event['Records'][0]['Sns']['Message']
    print("From SNS: " + message)
    response = client.get_resource_config_history(
        resourceType='AWS::DynamoDB::Table',
        resourceId=message
    )
    response2 = dict(response)
    print(response)
    return message
Here's some Python3 code that shows how to access the elements:
import boto3
import json
import pprint

config_client = boto3.client('config')

response = config_client.get_resource_config_history(
    resourceType='AWS::DynamoDB::Table',
    resourceId='stack-table'
)

for item in response['configurationItems']:
    configuration = item['configuration']  # Returns a JSON string
    config = json.loads(configuration)     # Convert to Python object
    pprint.pprint(config)                  # Show what's in it
    print(config['tableArn'])              # Access elements in object
The trick is that the configuration field contains a JSON string that needs to be converted into a Python object for easy access.
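As a side note (not part of the original answer): if you also want to dump the whole boto3 response, datetime values included, for inspection, passing default=str to json.dumps is a simple workaround for the "not JSON serializable" error. A minimal sketch:

import json

# Anything json can't serialize natively (e.g. datetime) is converted with str()
print(json.dumps(response, indent=2, default=str))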

Django: annotate Sum Case When depending on the status of a field

In my application I need to get all transactions per day for the last 30 days.
In the transactions model I have a currency field, and I want to convert the value to euros if the chosen currency is GBP or USD.
models.py
class Transaction(TimeMixIn):
    COMPLETED = 1
    REJECTED = 2
    TRANSACTION_STATUS = (
        (COMPLETED, _('Completed')),
        (REJECTED, _('Rejected')),
    )

    user = models.ForeignKey(CustomUser)
    status = models.SmallIntegerField(choices=TRANSACTION_STATUS, default=COMPLETED)
    amount = models.DecimalField(default=0, decimal_places=2, max_digits=7)
    currency = models.CharField(max_length=3, choices=Core.CURRENCIES, default=Core.CURRENCY_EUR)
Until now this is what I've been using:
(Transaction.objects.filter(created__gte=last_month, status=Transaction.COMPLETED)
    .extra({"date": "date_trunc('day', created)"})
    .values("date")
    .annotate(amount=Sum("amount")))
which returns a queryset containing dictionaries with date and amount:
<QuerySet [{'date': datetime.datetime(2018, 6, 19, 0, 0, tzinfo=<UTC>), 'amount': Decimal('75.00')}]>
and this is what I tried now:
queryset = Transaction.objects.filter(
    created__gte=last_month, status=Transaction.COMPLETED
).extra(
    {"date": "date_trunc('day', created)"}
).values('date').annotate(
    amount=Sum(Case(When(currency=Core.CURRENCY_EUR, then='amount'),
                    When(currency=Core.CURRENCY_USD, then=F('amount') * 0.8662),
                    When(currency=Core.CURRENCY_GBP, then=F('amount') * 1.1413),
                    default=0, output_field=FloatField()))
)
which converts GBP or USD to EUR, but it creates 3 dictionaries with the same day instead of summing them.
This is what it returns: <QuerySet [{'date': datetime.datetime(2018, 6, 19, 0, 0, tzinfo=<UTC>), 'amount': 21.655}, {'date': datetime.datetime(2018, 6, 19, 0, 0, tzinfo=<UTC>), 'amount': 28.5325}, {'date': datetime.datetime(2018, 6, 19, 0, 0, tzinfo=<UTC>), 'amount': 25.0}]>
and this is what I want:
<QuerySet [{'date': datetime.datetime(2018, 6, 19, 0, 0, tzinfo=<UTC>), 'amount': 75.1875}]>
The only thing missing is an order_by. This will (yeah, I know that sounds strange) force Django to perform a GROUP BY. So it should be rewritten to:
queryset = Transaction.objects.filter(
    created__gte=last_month,
    status=Transaction.COMPLETED
).extra(
    {"date": "date_trunc('day', created)"}
).values(
    'date'
).annotate(
    amount=Sum(Case(
        When(currency=Core.CURRENCY_EUR, then='amount'),
        When(currency=Core.CURRENCY_USD, then=F('amount') * 0.8662),
        When(currency=Core.CURRENCY_GBP, then=F('amount') * 1.1413),
        default=0,
        output_field=FloatField()
    ))
).order_by('date')
(I fixed the formatting a bit here to make it more readable, especially for small screens, but if we ignore spacing it is the same as in the question, except for the .order_by(..), of course.)
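As a side note (not part of this answer): on recent Django versions the .extra() call can be replaced with TruncDay from django.db.models.functions, which keeps the whole query in the ORM. A rough sketch of the same query:

from django.db.models import Case, F, FloatField, Sum, When
from django.db.models.functions import TruncDay

queryset = Transaction.objects.filter(
    created__gte=last_month,
    status=Transaction.COMPLETED,
).annotate(
    date=TruncDay('created')  # truncate the timestamp to the day
).values(
    'date'
).annotate(
    amount=Sum(Case(
        When(currency=Core.CURRENCY_EUR, then='amount'),
        When(currency=Core.CURRENCY_USD, then=F('amount') * 0.8662),
        When(currency=Core.CURRENCY_GBP, then=F('amount') * 1.1413),
        default=0,
        output_field=FloatField(),
    ))
).order_by('date')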
We need to aggregate the queryset to accomplish what you are trying to do.
Try using aggregate():
queryset = Transaction.objects.filter(
    created__gte=last_month, status=Transaction.COMPLETED
).extra(
    {"date": "date_trunc('day', created)"}
).values('date').aggregate(
    amount=Sum(Case(When(currency=Core.CURRENCY_EUR, then='amount'),
                    When(currency=Core.CURRENCY_USD, then=F('amount') * 0.8662),
                    When(currency=Core.CURRENCY_GBP, then=F('amount') * 1.1413),
                    default=0, output_field=FloatField())))
For more info, see the documentation for aggregate().

How to improve performance of pymongo queries

I inherited an old Mongo database. Let's focus on the following two collections (removed most of their content for better readability):
Collection user
db.user.find_one({"email": "user#host.com"})
{'lastUpdate': datetime.datetime(2016, 9, 2, 11, 40, 13, 160000),
'creationTime': datetime.datetime(2016, 6, 23, 7, 19, 10, 6000),
'_id': ObjectId('576b8d6ee4b0a37270b742c7'),
'email': 'user#host.com' }
Collection entry (one user to many entries):
db.entry.find_one({"userId": _id})
{'date_entered': datetime.datetime(2015, 2, 7, 0, 0),
'creationTime': datetime.datetime(2015, 2, 8, 14, 41, 50, 701000),
'lastUpdate': datetime.datetime(2015, 2, 9, 3, 28, 2, 115000),
'_id': ObjectId('54d775aee4b035e584287a42'),
'userId': '576b8d6ee4b0a37270b742c7',
'data': 'test'}
As you can see, there is no DBRef between the two.
What I would like to do is to count the total number of entries, and the number of entries updated after a given date.
To do this I used Python's pymongo library. The code below gets me what I need, but it is painfully slow.
import time
from datetime import datetime

from pymongo import MongoClient

client = MongoClient('mongodb://foobar/')
db = client.userdata

# First I need to fetch all user ids. Otherwise db cursor will time out after some time.
user_ids = []  # build a list of tuples (email, id)
for user in db.user.find():
    user_ids.append((user['email'], str(user['_id'])))

date = datetime(2016, 1, 1)

for user_id in user_ids:
    email, _id = user_id
    t0 = time.time()

    query = {"userId": _id}
    no_of_all_entries = db.entry.find(query).count()

    query = {"userId": _id, "lastUpdate": {"$gte": date}}
    no_of_entries_this_year = db.entry.find(query).count()

    t1 = time.time()
    print("delay ", round(t1 - t0, 2))
    print(email, no_of_all_entries, no_of_entries_this_year)
It takes around 0.83 seconds to run both db.entry.find queries on my laptop, and 0.54 on an AWS server (not the MongoDB server).
Having ~20000 users, it takes a painful 3 hours to get all the data.
Is that the kind of latency you'd expect to see in Mongo? What can I do to improve this? Bear in mind that MongoDB is fairly new to me.
Instead of running two aggregations per user separately, you can get both aggregates for all users at once with db.collection.aggregate().
And instead of (email, userId) tuples, we make it a dictionary, as it is easier to use for looking up the corresponding email.
user_emails = {str(user['_id']): user['email'] for user in db.user.find()}

date = datetime(2016, 1, 1)

entry_counts = db.entry.aggregate([
    {"$group": {
        "_id": "$userId",
        "count": {"$sum": 1},
        "count_this_year": {
            "$sum": {
                "$cond": [{"$gte": ["$lastUpdate", date]}, 1, 0]
            }
        }
    }}
])

for entry in entry_counts:
    print(user_emails.get(entry['_id']),
          entry['count'],
          entry['count_this_year'])
I'm pretty sure getting the user's email address into the result could be done but I'm not a mongo expert either.
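One more thing worth checking, independent of the aggregation: if entry has no index on userId, every find() in the original loop has to scan the whole collection, which alone would explain latencies like these. A sketch of adding indexes with pymongo (assuming the same db handle as above):

# Single-field index for the "all entries per user" lookups
db.entry.create_index("userId")

# Compound index that also covers the lastUpdate filter
db.entry.create_index([("userId", 1), ("lastUpdate", -1)])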
