Handle divide by zero with aggregated fields in Annotate expression

Handle divide by zero with aggregated fields in Annotate expression - python

Currently within the following query, win_rate will always default to 0 unless Lost is 0- in that case, win_rate becomes 100. How do I properly allow division of the aggregated fields while avoiding the division by zero error?
top_markets = list(opps
.annotate(name=Subquery(Market.objects.filter(id=OuterRef('market'))[:1].values('marketname')))
.order_by('name')
.values('name')
.annotate(opps=Count('id', filter=Q(datecreated__range=(start_date, end_date))),
Won=Count(
'id', filter=Q(winloss='Won') & Q(date_closed__range=(start_date, end_date))),
Lost=Count('id', filter=Q(winloss='Lost') & Q(
date_closed__range=(start_date, end_date))),
Concluded=F('Won') + F('Lost'))
)
.annotate(
win_rate=Case(
When(Won=0, then=0),
default=((F('Won')) / \
(F('Won')) + F('Lost'))) * 100
)
Edit-
Adding my model. opps is a pre-filtered query on the model Opportunity:
class Opportunity(models.Model):
name = models.CharField()
winloss = models.CharField()
market = models.ForeignKey(Market, on_delete=SET_NULL)
datecreated = models.DateTimeField(auto_now=True)

Cast it to a FloatField:
# Case and When are used below, so they must be imported as well.
from django.db.models import Case, Count, F, FloatField, Q, When
from django.db.models.functions import Cast

# One row per market name, with counts restricted to the date window and a
# win rate (in percent) that is guarded against division by zero.
opps.values(name=F('market__marketname')).annotate(
    opps=Count('id', filter=Q(datecreated__range=(start_date, end_date))),
    Won=Count(
        'id', filter=Q(winloss='Won', date_closed__range=(start_date, end_date))
    ),
    Lost=Count(
        'id', filter=Q(winloss='Lost', date_closed__range=(start_date, end_date))
    ),
    Concluded=F('Won') + F('Lost'),
    win_rate=Case(
        # Only divide when at least one opportunity was concluded; both
        # operands are cast to float so the database does not perform
        # integer division (which is what truncates the rate to 0 or 100).
        When(
            Concluded__gt=0,
            then=Cast('Won', output_field=FloatField())
            * 100
            / Cast('Concluded', output_field=FloatField()),
        ),
        # No concluded opportunities: report 0 instead of dividing by zero.
        default=0,
        output_field=FloatField(),
    ),
).order_by('name')
That being said, I don't see why you would do this on the database side: you already have the number of won and lost Opportunities, so you can simply compute the rate at the Python/Django level. Furthermore, please do not use the queryset to generate serialized data: use a serializer.

Related

Django, annotate + values duplicates records

I have a model called Location and I'm querying the model with filters that yield 4000 objects:
count = Location.objects.filter(**filters).count()
4000
there is a related Model called KPIs, each Location has many KPIs and there are 2,944,000 KPIs records.
I have a very complex query for the Location that annotates a lot of the KPIs data.
the annotations:
def contribute_annotations(self):
user = self.request.user
self.kpis = user.user_selected_kpis.get_all_kpis_qs()
kpis_names = tuple(kpi.internal_name for kpi in self.kpis)
branch_date = Subquery(BranchKPIs.objects.
filter(branch__location__id=OuterRef(ID)).
order_by('-date').
values(DATE)[:1]
)
# summing the members amount
filters_for_branch = (
Q(location_branches__prem=True) &
~Q(location_branches__branch_scores__members_count=0) &
Q(location_branches__branch_scores__date=F(BRANCH_DATE))
)
sum_of_members_prem_count = Coalesce(Sum('location_branches__branch_scores__members_count',
output_field=IntegerField(),
filter=filters_for_branch),
0)
# location kpis prefetch object
location_kpis_qs = LocationKPIs.objects.filter(date__range=month_range).only(DATE, LOCATION, *kpis_names)
prefetch_location_kpis = Prefetch(lookup=RelatedNames.LOCATION_SCORES,
queryset=location_kpis_qs,
)
assigned_members_count_of_latest = Case(When(location_scores__date=F(LATEST_DATE),
then=f'location_scores__assigned_members_count'))
members_count_of_latest = Case(When(location_scores__date=F(LATEST_DATE),
then=f'location_scores__members_count'))
# kpis annotations for Avg, Trends, and Sizing
kpis_annotations, alias_for_trends, kpis_objects = {}, {}, {}
for kpi in self.kpis:
name = kpi.internal_name
# annotating the last kpi score
kpis_annotations[name] = Case(When(location_scores__date=F('latest_date'),
then=f'location_scores__{name}'), default=0)
# annotating the kpi's month avg
alias_for_trends[f'{name}_avg'] = Coalesce(
Avg(f'location_scores__{name}',
filter=Q(location_scores__date__range=month_range), output_field=IntegerField()
),
0
)
# comparing latest score to the monthly avg in order to determine the kpi's trend
when_equal = When(**{f'{name}_avg': F(name)}, then=0)
when_trend_is_down = When(**{f'{name}_avg__gt': F(name)}, then=-1)
when_trend_is_up = When(**{f'{name}_avg__lt': F(name)}, then=1)
kpi_trend = Case(when_equal, when_trend_is_up, when_trend_is_down,
default=0, output_field=IntegerField())
# annotating the score color
when_red = When(**{f'{name}__gte': kpi.location_level_red_threshold.lower,
f'{name}__lte': kpi.location_level_red_threshold.upper},
then=1
)
when_yellow = When(**{f'{name}__gte': kpi.location_level_yellow_threshold.lower,
f'{name}__lte': kpi.location_level_yellow_threshold.upper},
then=2
)
when_green = When(**{f'{name}__gte': kpi.location_level_green_threshold.lower,
f'{name}__lte': kpi.location_level_green_threshold.upper},
then=3
)
score_type = Case(when_red, when_yellow, when_green, default=2)
# outputs kpi : {score: int, trend: int, score_type: int}
kpis_objects[name] = JSONObject(
score=F(name),
trend=kpi_trend,
score_type=score_type
)
# cases for the pin size of the location, it depends on how many members are in it
when_in_s_size = When(
Q(member_count__gte=settings.S_LOCATION_SIZE[0]) & Q(member_count__lte=settings.S_LOCATION_SIZE[-1]),
then=1)
when_in_m_size = When(
Q(member_count__gte=settings.M_LOCATION_SIZE[0]) & Q(member_count__lte=settings.M_LOCATION_SIZE[-1]),
then=2)
when_in_l_size = When(
Q(member_count__gte=settings.L_LOCATION_SIZE[0]) & Q(member_count__lte=settings.L_LOCATION_SIZE[-1]),
then=3)
when_in_xl_size = When(
Q(member_count__gte=settings.XL_LOCATION_SIZE[0]) & Q(member_count__lte=settings.XL_LOCATION_SIZE[-1]),
then=4)
location_size = Case(when_in_s_size, when_in_m_size, when_in_l_size, when_in_xl_size,
default=2,
output_field=IntegerField())
# location's address string
location_str = Concat(LOCATION__STREET, LOCATION__CITY, LOCATION__COUNTRY,
output_field=CharField())
return (
sum_of_members_prem_count, prefetch_location_kpis, assigned_members_count_of_latest, members_count_of_latest,
kpis_annotations, location_size, alias_for_trends, location_str, kpis_names, kpis_objects, branch_date)
filters = {'user': self.request.user, ACTIVE: True}
(sum_of_members_prem_count, prefetch_location_kpis, assigned_members_count_of_latest, members_count_of_latest,
kpis_annotations, location_size, alias_for_trends, location_str, kpis_names, kpis_objects, branch_date) = self.contribute_annotations()
query_set = (Location.objects.
filter(**filters).
select_related(RelatedNames.LOCATION).
prefetch_related(prefetch_location_kpis).
alias(latest_date=Max('scores__date'),
branch_date=branch_date,
**alias_for_trends,
**kpis_annotations
).
annotate(members_prem_count=sum_of_members_prem,
members_count=members_count_of_latest,
assigned_members_count=assigned_count_of_latest,
farm_latitude=Min(LOCATION__LATITUDE),
farm_longitude=Min(LOCATION__LONGITUDE),
address=location_str,
farm_size=farm_size,
latest_date=Max('farm_scores__date'),
**kpis_objects
).
values(ID, NAME, ADMIN_EMAIL, ADMIN_PHONE, MEMBERS_PREM_COUNT,
MEMBERS_COUNT, ASSIGNED_MEMBERS_COUNT, SIZE, ADDRESS,
latitude=F(LOCATION_LATITUDE), longitude=F(LOCATION_LONGITUDE), *kpis_names
)
)
This query yields 2,944,000 records, which means it returns one row for each KPI record rather than one row per Location.
I tried adding distinct calls in several ways but I either end up with:
NotImplementedError: annotate() + distinct(fields) is not implemented.
Or the query just ignores it and doesn't add distinct location objects.
the docs suggest that values and distinct don't play nice together and that probably somewhere there is an order by that breaks it.
I've looked at all the involved models, queries and subqueries and removed the order by but it still doesn't work.
I also tried adding this to the query:
query_set.query.clear_ordering(True)
query_set = query_set.order_by(ID).distinct(ID)
but this raises that NotImplementedError

Well, I'm not sure why it's like this and maybe in some cases it won't work.
But, I changed the query to the following:
query_set = (Location.objects.
filter(**filters).
select_related(RelatedNames.LOCATION).
prefetch_related(prefetch_location_kpis).
alias(latest_date=Max('scores__date'),
branch_date=branch_date,
**alias_for_trends,
**kpis_annotations
).
distinct(ID).
annotate(members_prem_count=sum_of_members_prem,
members_count=members_count_of_latest,
assigned_members_count=assigned_count_of_latest,
farm_latitude=Min(LOCATION__LATITUDE),
farm_longitude=Min(LOCATION__LONGITUDE),
address=location_str,
farm_size=farm_size,
latest_date=Max('farm_scores__date'),
**kpis_objects
).
distinct(ID)
)
and overriding Django's source code in django/db/models/sql/compiler.py
line 595
if grouping:
if distinct_fields:
raise NotImplementedError('annotate() + distinct(fields) is not implemented.')
order_by = order_by or self.connection.ops.force_no_ordering()
result.append('GROUP BY %s' % ', '.join(grouping))
if self._meta_ordering:
order_by = None
if having:
result.append('HAVING %s' % having)
params.extend(h_params)
just commented out the if distinct_fields condition
if grouping:
# if distinct_fields:
# raise NotImplementedError('annotate() + distinct(fields) is not implemented.')
order_by = order_by or self.connection.ops.force_no_ordering()
result.append('GROUP BY %s' % ', '.join(grouping))
if self._meta_ordering:
order_by = None
if having:
result.append('HAVING %s' % having)
params.extend(h_params)

How to limit top N of each group in Django ORM by using Postgres Window functions or Lateral Joins?

I have following Post, Category & PostScore Model.
class Post(models.Model):
category = models.ForeignKey('Category', on_delete=models.SET_NULL, related_name='category_posts', limit_choices_to={'parent_category': None}, blank=True, null=True)
status = models.CharField(max_length=100, choices=STATUS_CHOICES, default='draft')
deleted_at = models.DateTimeField(null=True, blank=True)
...
...
class Category(models.Model):
title = models.CharField(max_length=100)
parent_category = models.ForeignKey('self', on_delete=models.SET_NULL,
related_name='sub_categories', null=True, blank=True,
limit_choices_to={'parent_category': None})
...
...
class PostScore(models.Model):
post = models.OneToOneField(Post, on_delete=models.CASCADE, related_name='post_score')
total_score = models.DecimalField(max_digits=8, decimal_places=5, default=0)
...
...
What I want is to write a query that returns N posts (Post) for each distinct category (Category), sorted by post score (the total_score column in the PostScore model) in descending order, so that I have at most N records per category, namely those with the highest post scores.
So i can achieve the above mentioned thing by the following raw query which gives me top 10 posts having highest score of each category :
SELECT *
FROM (
SELECT *,
RANK() OVER (PARTITION BY "post"."category_id"
ORDER BY "postscore"."total_score" DESC) AS "rank"
FROM
"post"
LEFT OUTER JOIN
"postscore"
ON
("post"."id" = "postscore"."post_id")
WHERE
("post"."deleted_at" IS NULL AND "post"."status" = 'accepted')
ORDER BY
"postscore"."total_score"
DESC
) final_posts
WHERE
rank <= 10
What i have achieved so far using Django ORM:
>>> from django.db.models.expressions import Window
>>> from django.db.models.functions import Rank
>>> from django.db.models import F
>>> posts = Post.objects.annotate(
rank=Window( expression=Rank(),
order_by=F('post_score__total_score').desc(),
partition_by[F('category_id')]
)). \
filter(status='accepted', deleted_at__isnull=True). \
order_by('-post_score__total_score')
which roughly evaluates to
>>> print(posts.query)
>>> SELECT *,
RANK() OVER (PARTITION BY "post"."category_id"
ORDER BY "postscore"."total_score" DESC) AS "rank"
FROM
"post"
LEFT OUTER JOIN
"postscore"
ON
("post"."id" = "postscore"."post_id")
WHERE
("post"."deleted_at" IS NULL AND "post"."status" = 'accepted')
ORDER BY
"postscore"."total_score"
DESC
So basically what is missing that i need to limit each group (i.e category) results by using “rank” alias.
Would love to know how this can be done ?
I have seen one answer suggested by Alexandr on this question, one way of achieving this is by using Subquery and in operator . Although it satisfies the above condition and outputs the right results but the query is very slow.
Anyway this would be the query if I go by Alexandr suggestions:
>>> from django.db.models import OuterRef, Subquery
>>> q = Post.objects.filter(status='accepted', deleted_at__isnull=True,
category=OuterRef('category')).order_by('-post_score__total_score')[:10]
>>> posts = Post.objects.filter(id__in=Subquery(q.values('id')))
So i am more keen in completing the above raw query (which is almost done just misses the limit part) by using window function in ORM. Also, i think this can be achieved by using lateral join so answers in this direction are also welcomed.

So I have got a workaround using a raw query, but the thing is that it returns a django.db.models.query.RawQuerySet, which doesn't support methods like filter, exclude, etc.
>>> posts = Post.objects.annotate(rank=Window(expression=Rank(),
order_by=F('post_score__total_score').desc(),
partition_by=[F('category_id')])).filter(status='accepted',
deleted_at__isnull=True)
>>> sql, params = posts.query.sql_with_params()
>>> posts = Post.objects.raw(""" SELECT * FROM ({}) final_posts WHERE
rank <= %s""".format(sql),[*params, 10],)
I'll wait for the answers which provides a solution which returns a QuerySet object instead, otherwise i have to do by this way only.

Django get values for Max of grouped data

After many trials and errors and checking similar questions, I think it worth asking it with all the details.
Here's a simple model. Let's say we have a Book model and a Reserve model that holds reservation data for each Book.
class Book(models.Model):
title = models.CharField(
'Book Title',
max_length=50
)
name = models.CharField(
max_length=250
)
class Reserve(models.Model):
book = models.ForeignKey(
Book,
on_delete=models.CASCADE
)
reserve_date = models.DateTimeField()
status = models.CharField(
'Reservation Status',
max_length=5,
choices=[
('R', 'Reserved'),
('F', 'Free')
]
)
I added a book and two reservation records to the model:
from django.utils import timezone
book_inst = Book(title='Book1')
book_inst.save()
reserve_inst = Reserve(book=book_inst, reserve_date=timezone.now(), status='R')
reserve_inst.save()
reserve_inst = Reserve(book=book_inst, reserve_date=timezone.now(), status='F')
reserve_inst.save()
My goal is to get data for the last reservation for each book. Based on other questions, I get it to this point:
from django.db.models import F, Q, Max
reserve_qs = Reserve.objects.values(
'book__title'
)
reserve_qs now has the last action for each Book, but when I add .values() it ignores the grouping and returns all the records.
I also tried filtering with F:
Reserve.objects.values(
'book__title'
).annotate(
last_action=Max('reserve_date')
).values(
).filter(
reserve_date=F('last_action')
)
I'm using Django 3 and SQLite.

By using another filter, you will break the GROUP BY mechanism. You can however simply obtain the last reservation with:
from django.db.models import F, Max
Reserve.objects.filter(
book__title='Book1'
).annotate(
book_title=F('book__title'),
last_action=Max('book__reserve__reserve_date')
).filter(
reserve_date=F('last_action')
)
or for all books:
from django.db.models import F, Max
qs = Reserve.objects.annotate(
book_title=F('book__title'),
last_action=Max('book__reserve__reserve_date')
).filter(
reserve_date=F('last_action')
).select_related('book')
Here we will thus calculate the maximum for that book. Since we here join on the same table, we thus group correctly.
This will retrieve all the last reservations for all Books that are retained after filtering. Normally that is one per Book. But if there are multiple Books with multiple Reservations with exactly the same timestamp, then multiple ones will be returned.
So we can for example print the reservations with:
# Print one summary line for every reservation retained in qs.
for q in qs:
    # The placeholders must be filled with str.format(); passing the values
    # as extra print() arguments would print the raw '{}' template followed
    # by the values instead of a formatted sentence.
    print(
        'Last reservation for {} is {} with status {}'.format(
            q.book.title,
            q.reserve_date,
            q.status,
        )
    )
For a single book, however, it is better to simply fetch the Book object and return the .latest(..) [Django-doc] reservation:
Book.objects.get(title='Book1').reserve_set.latest('reserve_date')

book_obj = Book.objects.get(title='Book1')
reserve_qs = book_obj.reserve_set.all()
This returns all the Reserves that contains this book.
You can get the latest object using .first() or .last(), or by sorting them.

How to run a custom aggregation on a queryset?

I have a model called LeaveEntry:
class LeaveEntry(models.Model):
date = models.DateField(auto_now=False, auto_now_add=False)
user = models.ForeignKey(
settings.AUTH_USER_MODEL,
on_delete=models.PROTECT,
limit_choices_to={'is_active': True},
unique_for_date='date'
)
half_day = models.BooleanField(default=False)
I get a set of LeaveEntries with the filter:
LeaveEntry.objects.filter(
leave_request=self.unapproved_leave
).count()
I would like to get an aggregation called total days, so where a LeaveEntry has half_day=True then it is half a day so 0.5.
What I was thinking based on the django aggregations docs was annotating the days like this:
days = LeaveEntry.objects.annotate(days=<If this half_day is True: 0.5 else 1>)

You can use django's conditional expressions Case and When (only for django 1.8+):
Keeping the order of filter() and annotate() in mind, you can count the number of days for unapproved leaves like so:
from django.db.models import Case, FloatField, Sum, When
# ...
# Weight every entry (0.5 for a half day, 1 for a full day) and then sum the
# weights. Count() would only count the rows and ignore the 0.5 weighting.
total_days = LeaveEntry.objects.filter(
    leave_request=self.unapproved_leave  # not sure what self relates to
).annotate(
    days=Case(
        When(half_day=True, then=0.5),
        When(half_day=False, then=1),
        output_field=FloatField()
    )
).aggregate(
    total_days=Sum('days')
)['total_days'] or 0  # Sum() yields None when no entry matches the filter

Django annotate queryset on a specific value of relational model attribute

Suppose there is a structure like this:
PARTICIPATION_STATUSES = (
(0, 'No, thanks'),
(1, 'I may attend'),
(2, 'I\'ll be there'),
)
class Model1(models.Model):
# ...
class Model2(models.Model):
status = models.PositiveIntegerField(
_('participation status'), choices=PARTICIPATION_STATUSES)
field = models.ForeignKey(Model1, related_name='model1_participation')
What I want to do is to annotate each object of Model1 with the count of Model2 objects whose status equals a specific value (a status number in this particular example).
In my pseudo code it would look like:
queryset = Model1.objects.all()
queryset.annotate(declined=Count('model1_participation__status=0'))
queryset.annotate(not_sure=Count('model1_participation__status=1'))
queryset.annotate(accepted=Count('model1_participation__status=2'))
But I can't annotate the queryset in this way as Django doesn't resolve status=<n>.
What is the right way to achieve what I want?

If you are using Django 1.8 or above you can use Conditional Aggregations, these should work for annotate querysets.
from django.db.models import IntegerField, Case, When, Count
queryset = Model1.objects.all()
queryset = queryset.annotate(
declined=Count(
Case(When(model1_participation__status=0, then=1),
output_field=IntegerField())
),
not_sure=Count(
Case(When(model1_participation__status=1, then=1),
output_field=IntegerField())
),
accepted=Count(
Case(When(model1_participation__status=2, then=1),
output_field=IntegerField())
)
)

You can use an Exists Subquery:
from django.db.models.expressions import Exists, ExpressionWrapper, OuterRef, Subquery, Value
from django.db.models.fields import BooleanField
queryset = Model1.objects.all()
# annotate() returns a new queryset, so the result has to be re-bound.
# Each annotation is a boolean flag: True when at least one related Model2
# row with the given status exists (it is not a count of such rows).
queryset = queryset.annotate(
    declined=ExpressionWrapper(
        Exists(Model2.objects.filter(
            field=OuterRef('id'),
            status=0)),
        output_field=BooleanField()),
    not_sure=ExpressionWrapper(
        Exists(Model2.objects.filter(
            field=OuterRef('id'),
            status=1)),
        output_field=BooleanField()),
    accepted=ExpressionWrapper(
        Exists(Model2.objects.filter(
            field=OuterRef('id'),
            status=2)),
        output_field=BooleanField())
)
To make it a bit more clear/reusable you can refactor into a function:
def is_status(status_code):
    """Build a boolean annotation for "a related Model2 row has this status".

    Args:
        status_code: The Model2.status value to look for.

    Returns:
        An ExpressionWrapper around an Exists() subquery with a BooleanField
        output. The subquery matches Model2 rows whose ``field`` equals the
        outer row's ``id`` and whose ``status`` equals ``status_code``.
    """
    return ExpressionWrapper(
        Exists(Model2.objects.filter(
            field=OuterRef('id'),
            status=status_code)),
        output_field=BooleanField())
Model1.objects.annotate(
declined=is_status(0),
not_sure=is_status(1),
accepted=is_status(2)
)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Handle divide by zero with aggregated fields in Annotate expression - python

Related

Django, annotate + values duplicates records

How to limit top N of each group in Django ORM by using Postgres Window functions or Lateral Joins?

Django get values for Max of grouped data

How to run a custom aggregation on a queryset?

Django annotate queryset on a specific value of relational model attribute

Categories

Resources