My Django query is very slow in giving me data on the terminal - python

I have a users table which has 3 types of users Student, Faculty and Club and I have a university table.
What I want is how many users are there in the specific university.
I am getting my desired output, but it is very slow. I have 90k users, and it takes minutes to produce the results.
My user model:-
from __future__ import unicode_literals
from django.db import models
from django.contrib.auth.models import User
from cms.models.masterUserTypes import MasterUserTypes
from cms.models.universities import Universities
from cms.models.departments import MasterDepartments
# WE ARE AT MODELS/APPUSERS
requestChoice = (
('male', 'male'),
('female', 'female'),
)
class Users(models.Model):
    """Application user (student, faculty or club) stored in the ``users`` table.

    The table is not managed by Django (``managed = False``), so every field
    names the exact column it maps to through ``db_column``.
    """

    # Django only accepts a field called ``id`` when it is the primary key;
    # ``max_length`` is ignored on integer fields, so it is not passed.
    id = models.IntegerField(db_column="id", primary_key=True, help_text="")
    userTypeId = models.ForeignKey(MasterUserTypes, db_column="userTypeId")
    universityId = models.ForeignKey(Universities, db_column="universityId")
    departmentId = models.ForeignKey(MasterDepartments, db_column="departmentId", help_text="")
    name = models.CharField(db_column="name", max_length=255, help_text="")
    username = models.CharField(db_column="username", unique=True, max_length=255, help_text="")
    email = models.CharField(db_column="email", unique=True, max_length=255, help_text="")
    password = models.CharField(db_column="password", max_length=255, help_text="")
    bio = models.TextField(db_column="bio", max_length=500, help_text="")
    gender = models.CharField(db_column="gender", max_length=6, choices=requestChoice, help_text="")
    mobileNo = models.CharField(db_column='mobileNo', max_length=16, help_text="")
    dob = models.DateField(db_column="dob", help_text="")
    major = models.CharField(db_column="major", max_length=255, help_text="")
    graduationYear = models.IntegerField(db_column='graduationYear', help_text="")
    canAddNews = models.BooleanField(db_column='canAddNews', default=False, help_text="")
    receivePrivateMsgNotification = models.BooleanField(db_column='receivePrivateMsgNotification', default=True, help_text="")
    receivePrivateMsg = models.BooleanField(db_column='receivePrivateMsg', default=True, help_text="")
    receiveCommentNotification = models.BooleanField(db_column='receiveCommentNotification', default=True, help_text="")
    receiveLikeNotification = models.BooleanField(db_column='receiveLikeNotification', default=True, help_text="")
    receiveFavoriteFollowNotification = models.BooleanField(db_column='receiveFavoriteFollowNotification', default=True, help_text="")
    receiveNewPostNotification = models.BooleanField(db_column='receiveNewPostNotification', default=True, help_text="")
    allowInPopularList = models.BooleanField(db_column='allowInPopularList', default=True, help_text="")
    xmppResponse = models.TextField(db_column='xmppResponse', help_text="")
    xmppDatetime = models.DateTimeField(db_column='xmppDatetime', help_text="")
    status = models.BooleanField(db_column="status", default=False, help_text="")
    deactivatedByAdmin = models.BooleanField(db_column="deactivatedByAdmin", default=False, help_text="")
    # auto_now_add stamps the row once at creation; auto_now would overwrite
    # the creation time on every save.
    createdAt = models.DateTimeField(db_column='createdAt', auto_now_add=True, help_text="")
    modifiedAt = models.DateTimeField(db_column='modifiedAt', auto_now=True, help_text="")
    updatedBy = models.ForeignKey(User, db_column="updatedBy", help_text="Logged in user updated by ......")
    lastPasswordReset = models.DateTimeField(db_column='lastPasswordReset', help_text="")
    # NOTE(review): this field previously reused the "departmentId" column that
    # the departmentId foreign key already maps to; the column is presumably
    # named "authorities" -- confirm against the actual table schema.
    authorities = models.CharField(db_column="authorities", max_length=255, help_text="")

    class Meta:
        managed = False
        db_table = 'users'
The query I am using, which produces the desired output but is too slow, is:
# Walk over every university and count its users per type, then overall.
# This issues four COUNT queries per university.
universities = Universities.objects.using('cms').all()
for item in universities:
    members = Users.objects.using('cms').filter(universityId=item.id)
    studentcount = members.filter(userTypeId=2).count()
    facultyCount = members.filter(userTypeId=1).count()
    clubCount = members.filter(userTypeId=3).count()
    totalcount = members.count()
    print("%s %s %s %s" % (studentcount, facultyCount, clubCount, totalcount))
    print(item.name)

You should use annotate to get the counts for each university and conditional expressions to get the counts based on conditions (docs)
from django.db.models import Case, Count, IntegerField, Sum, When

# One query for all universities. Reverse lookups inside annotate()/filter()
# use the lowercased model name ("users"); the "_set" suffix only exists on
# the instance-level related manager. default=0 makes a university without
# users of a given type report 0 instead of NULL/None.
Universities.objects.using('cms').annotate(
    studentcount=Sum(Case(When(users__userTypeId=2, then=1), default=0, output_field=IntegerField())),
    facultyCount=Sum(Case(When(users__userTypeId=1, then=1), default=0, output_field=IntegerField())),
    clubCount=Sum(Case(When(users__userTypeId=3, then=1), default=0, output_field=IntegerField())),
    totalcount=Count('users'),
)

First, an obvious optimization. In the loop, you're doing essentially the same query four times: thrice filtering for different userTypeId, and once without one. You can do this in a single COUNT(*) ... GROUP BY userTypeId query.
...
# Build a dict {userTypeId: count} for the current university with a single
# COUNT(...) ... GROUP BY userTypeId query instead of one query per user type.
qs = Users.objects.using('cms').filter(universityId=item.id)
counts = {
    row["userTypeId"]: row["cnt"]
    for row in qs.values('userTypeId').annotate(cnt=Count('pk'))
}
student_count = counts.get(2, 0)
faculty_count = counts.get(1, 0)
club_count = counts.get(3, 0)
total_count = sum(counts.values())  # Assuming there may be other userTypeIds
...
However, you're still doing 1+n queries, where n is the number of universities you have in the database. This is fine if the number is low, but if it's high you need further aggregation, joining Universities and Users. A first draft I came up with is something like this:
# Assuming Universities.name is unique; otherwise group by the university ID
# instead of the name so that different universities are not merged.
# The foreign key on Users is called universityId, hence "universityId__name".
qs = (
    Users.objects.using('cms')
    .values('userTypeId', 'universityId__name')
    .annotate(cnt=Count('pk'))
    # itertools.groupby() only merges adjacent rows, so the rows have to
    # arrive sorted by the grouping key.
    .order_by('universityId__name')
)
for name, group in itertools.groupby(qs, lambda row: row["universityId__name"]):
    print("University: %s" % name)
    cnts = {g["userTypeId"]: g["cnt"] for g in group}
    faculty, student, club = cnts.get(1, 0), cnts.get(2, 0), cnts.get(3, 0)
    # NOTE: this assumes there are only few (if any) userTypeId values other
    # than {1, 2, 3}; any others are still included in the total.
    total = sum(cnts.values())
    print(" Student: %d, faculty: %d, club: %d, total: %d" % (
        student, faculty, club, total))
I might've made a typo there, but hope it's correct. In terms of SQL, it should emit a query like
-- One row per (university, user type) with the number of users of that type.
SELECT uni.name, usr.userTypeId, COUNT(usr.id)
FROM some_app_universities AS uni
LEFT JOIN some_app_users AS usr ON usr.universityId = uni.id
GROUP BY uni.name, usr.userTypeId
ORDER BY uni.name
Consider reading documentation on aggregations and annotations. And be sure to check out raw SQL that Django ORM emits (e.g. use Django Debug Toolbar) and analyze how well it works on your database. For example, use EXPLAIN SELECT if you're using PostgreSQL. Depending on your dataset, you may benefit from some indexes there (e.g. on userTypeId column).
Oh, and on a side note... it's off-topic, but in Python it's a custom to have variables and attributes use lowercase_with_underscores. In Django, model class names are usually singular, e.g. User and University.

Related

Group by query in Django ORM

I am confused about how to write a Django query to get my data. I have 2 tables, 'ticket' and 'ticket_details'. Below is the schema for them.
Ticket(id, name, type, user)
TicketDetails(ticket_id, message, created_time)
Note: Multiple message can be associated to one ticket id.
And ticket_id is a foreign key to the Ticket table.
I would like to fetch all the columns from both the table where only the latest message from the TicketDetails table should be picked for a particular ticket id.
Example:
Ticket
id, name, type, user
1,install, application, usr1
TicketDetails
ticket_id, message, created_time
1, <message1>, 12:00 PM
1, <message2>, 04:00 PM
1, <message3>, 05:00 PM -->latest entry
Expected Output:
id, name, type, user, message, created_time
1, install, application, usr1, <message3>, 05:00PM
Thanks in advance
I made some assumptions about your models, you didn't provide any:
class Ticket(models.Model):
name = models.CharField(max_length=50)
type = models.CharField(max_length=50)
user = models.ForeignKey('auth.User', on_delete=models.CASCADE)
# Model names should NEVER end with "s"
class TicketDetail(models.Model):
ticket = models.ForeignKey(Ticket, on_delete=models.CASCADE)
message = models.CharField(max_length=50)
created_time = models.DateTimeField(auto_now_add=True)
You have 2 options:
you can write it in pure sql, you lose the ability to filter
sql = """
SELECT ticket.id, ticket.name, ticket.type, ticket.user_id, detail.message
FROM {ticket} ticket
LEFT JOIN (
SELECT detail.ticket_id, detail.message
FROM {detail} detail
INNER JOIN (
SELECT MAX(id) id, ticket_id
FROM {detail}
GROUP BY ticket_id
) detail_message ON detail.id = detail_message.id
) detail ON detail.ticket_id = ticket.id
""".format(ticket=Ticket._meta.db_table, detail=TicketDetail._meta.db_table)
tickets = Ticket.objects.raw(sql)
for ticket in tickets:
print(ticket.id, ticket.message)
Write it in the "django" way
latest_messages = TicketDetail.objects.values('ticket_id').annotate(id=models.Max('id')).values('id')
tickets = Ticket.objects.prefetch_related(models.Prefetch('ticketdetail_set', TicketDetail.objects.filter(id__in=latest_messages))).order_by('id')
for ticket in tickets:
print(ticket.id)
# this iteration will only ever yield 1 result.. or nothing.
for detail in ticket.ticketdetail_set.all():
print(detail.message)
Here are the tests:
# Uses factory_boy and Faker to fill in the data.
class UserFactory(factory.django.DjangoModelFactory):
    """Factory for auth users with generated names and a sequential username."""

    class Meta:
        model = auth.models.User
        django_get_or_create = ('username',)

    # factory.Faker is evaluated for every built instance. Calling
    # fake.first_name() directly in the class body would run only once, at
    # class creation, and give every user the same name.
    first_name = factory.Faker('first_name')
    last_name = factory.Faker('last_name')
    email = factory.LazyAttribute(
        lambda obj: "{}.{}@gmail.com".format(obj.last_name, obj.first_name).lower()
    )
    username = factory.Sequence(lambda n: 'user' + str(n))
class SimpleTestCase(TestCase):
    """Checks that both approaches return only the latest detail per ticket."""

    def setUp(self):
        # Two tickets with three details each; only the detail created last
        # for each ticket (the one with the highest id) should be selected.
        ticket1 = Ticket.objects.create(user=UserFactory(), type='A', name='Number 1')
        TicketDetail.objects.create(ticket=ticket1, message='you wont see this')
        TicketDetail.objects.create(ticket=ticket1, message='you wont see this either')
        TicketDetail.objects.create(ticket=ticket1, message='YES!!')
        ticket2 = Ticket.objects.create(user=UserFactory(), type='B', name='Number 2')
        TicketDetail.objects.create(ticket=ticket2, message='you also wont see this')
        TicketDetail.objects.create(ticket=ticket2, message='you also wont see this either')
        TicketDetail.objects.create(ticket=ticket2, message='also YES!!')

    def test_flatten_pure_sql(self):
        # ORDER BY keeps the row order deterministic; without it the database
        # may return the tickets in any order and the list comparison below
        # could fail intermittently.
        sql = """
        SELECT ticket.id, ticket.name, ticket.type, ticket.user_id, detail.message
        FROM {ticket} ticket
        LEFT JOIN (
            SELECT detail.ticket_id, detail.message
            FROM {detail} detail
            INNER JOIN (
                SELECT MAX(id) id, ticket_id
                FROM {detail}
                GROUP BY ticket_id
            ) detail_message ON detail.id = detail_message.id
        ) detail ON detail.ticket_id = ticket.id
        ORDER BY ticket.id
        """.format(ticket=Ticket._meta.db_table, detail=TicketDetail._meta.db_table)
        # assertEqual is used because assertEquals is a deprecated alias.
        self.assertEqual(['YES!!', 'also YES!!'], [x.message for x in Ticket.objects.raw(sql)])

    def test_orm_way(self):
        # Sub-select the highest detail id per ticket, then prefetch only those.
        latest_messages = TicketDetail.objects.values('ticket_id').annotate(id=models.Max('id')).values('id')
        tickets = Ticket.objects.prefetch_related(
            models.Prefetch('ticketdetail_set', TicketDetail.objects.filter(id__in=latest_messages))
        ).order_by('id')
        self.assertEqual(['Number 1', 'Number 2'], [x.name for x in tickets])
        self.assertEqual(['YES!!'], [x.message for x in tickets[0].ticketdetail_set.all()])
        self.assertEqual(['also YES!!'], [x.message for x in tickets[1].ticketdetail_set.all()])

How to change join and group by SQL to ORM in Django

I'm new in Django. So, I want to join two models which are company and client and count the number of clients for each of the company. Here the SQL
SELECT Company_company.name, count(Client_client.cid)
FROM Company_company
LEFT JOIN Client_client
ON Company_company.comid = Client_client.comid_id
GROUP BY Company_company.name;
But since in Django, we use ORM. So I'm a little bit confusing since I'm a beginner. I already refer few SQL to ORM converter website such as Django ORM and do some try and error. But, I didn't know where the problem since I want the output from the ORM to be classified into a different array. Here is my code:
labels = []
data = []
queryClientCompany = client.objects.values('comid').annotate(c=Count('cid')).values('comid__name','c')
for comp in queryClientCompany:
labels.append(comp.comid__name)
data.append(comp.c)
Here some of the relevant things in the client and company models:
class client (models.Model):
#client info
cid = models.AutoField(primary_key = True)
comid = models.ForeignKey(company,related_name='companys',
on_delete = models.DO_NOTHING,verbose_name="Company",null = True, blank = True)
class company(models.Model):
comid = models.AutoField(_('Company'),primary_key = True)
#company info
name = models.CharField(_('Company Name'),max_length = 50)
The error states that comid__name is not defined. So how do I actually append the result? I hope someone can help me. Thank you in advance for your help.
You should query from the opposite side to perform the LEFT OUTER JOIN between company and client (and not client and company):
from django.db.models import Count
labels = []
data = []
queryClientCompany = company.objects.annotate(
c=Count('companys__cid')
)
for comp in queryClientCompany:
labels.append(comp.name)
data.append(comp.c)
The companys part is due to the related_name='companys', but it does not make much sense to name this relation that way. The related_name=… parameter [Django-doc] specifies how to access the Clients for a given Company, so clients is a more appropriate value for the related_name:
class client (models.Model):
cid = models.AutoField(primary_key=True)
comid = models.ForeignKey(
company,
related_name='clients',
on_delete = models.DO_NOTHING,
verbose_name="Company",
null = True,
blank = True
)
then the query is:
from django.db.models import Count
labels = []
data = []
queryClientCompany = company.objects.annotate(
c=Count('clients__cid')
)
for comp in queryClientCompany:
labels.append(comp.name)
data.append(comp.c)

Add additional fields after phone number lookup in the database in Django

I am building an app that look for each phone number in the database. If there is any duplicate, I want to grab the first phone number found as the main record for that phone number, then for the duplicate information(name, location), get each one of those fields, and add it to the main record phone number fields (name, location), separated by a semi colon.
The outcome would look like this after checking the duplicate information of the main phone number record found:
Name Location Phone number
Helene,Sandra New York, Boston 000-000
Please find my model below:
class Document(models.Model):
name = models.CharField(null=True, max_length=254, blank=True)
location = models.CharField(null=True, max_length=254, blank=True)
phone_number = models.CharField(null=True, max_length=254, blank=True)
I am a bit lost on to achieve the above. Any help would be much appreciated.
Below is what I have tried so far:(not working)
from django.shortcuts import render
from .models import Document
def index(request):
search_number = list(Document.objects.order_by('-created').values("phone_number").distinct().order_by()) # Dictionary list of all numbers sorted by creation data without duplicate
for x in search_number:
try:
look_up = Document.objects.values("phone_number")
list_in_dba = look_up.phone_number
x in list_in_dba['phone_number']
print("Yes")
except:
print("No")
return render(request, 'snippets/index.html')
I would start with something like this.
from django.db.models import Count

# Phone numbers that occur on more than one Document, most duplicated first.
duplicate_phone_numbers = (
    Document.objects.values('phone_number')
    .annotate(total_items=Count('phone_number'))
    .filter(total_items__gt=1)
    .order_by('-total_items')
)
for entry in duplicate_phone_numbers:
    phone_number = entry['phone_number']
    # Order by primary key so that "the first record" is well defined (the
    # one with the lowest primary key) instead of arbitrary.
    records = list(Document.objects.filter(phone_number=phone_number).order_by('pk'))
    # name and location are nullable, so empty values are skipped instead of
    # being concatenated (None + ";" would raise a TypeError).
    all_names = ";".join(r.name for r in records if r.name)
    all_locations = ";".join(r.location for r in records if r.location)
    # Unsure whether you want to just output the info here or update the
    # actual record, so both are shown.
    print("%s %s %s" % (all_names, all_locations, phone_number))
    # To update the actual record: store the merged values on the first one.
    record = records[0]
    record.name = all_names
    record.location = all_locations
    record.save()

Django reverse m2m query

Using the models from https://docs.djangoproject.com/en/dev/topics/db/queries/#making-queries with minor modifications:
from django.db import models
class Blog(models.Model):
name = models.CharField(max_length=100)
class Author(models.Model):
name = models.CharField(max_length=200)
joined = models.DateField()
def __str__(self):
return self.name
class Entry(models.Model):
blog = models.ForeignKey(Blog, on_delete=models.CASCADE)
headline = models.CharField(max_length=255)
authors = models.ManyToManyField(Author)
rating = models.IntegerField()
I would like to create a dictionary from Author to Entries, where the Author joined this year, and the Entry has a rating of 4 or better. The structure of the resulting dict should look like:
author_entries = {author1: [set of entries], author2: [set of entries], etc.}
while hitting the database less than 3'ish times (or at least not proportional to the number of Authors or Entries).
My first attempt (db hits == number of authors, 100 authors 100 db-hits):
res = {}
authors = Author.objects.filter(joined__year=date.today().year)
for author in authors:
res[author] = set(author.entry_set.filter(rating__gte=4))
second attempt, trying to read entries in one go:
res = {}
authors = Author.objects.filter(joined__year=date.today().year)
entries = Entry.objects.select_related().filter(rating__gte=4, authors__in=authors)
for author in authors:
res[author] = {e for e in entries if e.authors.filter(pk=author.pk)}
This one is even worse: 100 authors, 198 db-hits (the original second attempt used {e for e in entries if author in e.authors}, but Django wouldn't have it).
The only method I've found involves raw-sql (4 db-hits):
res = {}
_authors = Author.objects.filter(joined__year=date.today().year)
_entries = Entry.objects.select_related().filter(rating__gte=4, authors__in=_authors)
authors = {a.id: a for a in _authors}
entries = {e.id: e for e in _entries}
c = connection.cursor()
c.execute("""
select entry_id, author_id
from sampleapp_entry_authors
where author_id in (%s)
""" % ','.join(str(v) for v in authors.keys()))
res = {a: set() for a in _authors}
for eid, aid in c.fetchall():
if eid in entries:
res[authors[aid]].add(entries[eid])
(apologies for using string substitutions in the c.execute(..) call -- I couldn't find the syntax sqlite wanted for a where in ? call).
Is there a more Djangoesque way to do this?
I've created a git repo with the code I'm using (https://github.com/thebjorn/revm2m), the tests are in https://github.com/thebjorn/revm2m/blob/master/revm2m/sampleapp/tests.py
You can use a Prefetch-object [Django-doc] for that:
from django.db.models import Prefetch
good_ratings = Prefetch(
'entry_set',
queryset=Entry.objects.filter(rating__gte=4),
to_attr='good_ratings'
)
authors = Author.objects.filter(
joined__year=date.today().year
).prefetch_related(
good_ratings
)
Now the Author objects in authors will have an extra attribute good_ratings (the value of the to_attr of the Prefetch object) that is a preloaded list containing the Entrys with a rating greater than or equal to four.
So you can post-process these like:
res = {
author: set(author.good_ratings)
for author in authors
}
However, since the Author objects from this QuerySet (not Author objects in general) already carry the good_ratings attribute, building the dictionary probably adds little.

Django ORM for given group by SQL query with aggregation method sum and count

I have below given Django model
class ABC(models.Model):
user = models.ForeignKey(DEF)
name = models.CharField()
phone_num = models.CharField()
date = models.DateTimeField(auto_now=True)
amount = models.IntegerField()
I want to perform below query using Django ORM.
select *, sum(amount), count(date) from ABC group by phone_num;
I tried code below, but it does not work.
ABC.objects.all().annotate(count = Count("phone_num")).order_by("phone_num")
I am not sure whether it is possible to fetch the data you mentioned above (SELECT *, SUM(amount), COUNT(date)) with a single GROUP BY query; that probably requires a JOIN. At the least, you could try the variants below and then match the results against ABC.objects.all() by phone_num:
ABC.objects.values("phone_num").order_by().annotate(count = Count("date"), amount= Sum("amount"))
Notes:
values('phone_num') - for GROUP BY 'phone_num' clause.
order_by() - clears any default ordering, which would otherwise be added to the GROUP BY clause (you could remove that order_by() if the model has no default ordering).
p.s.
Also try to run query below:
ABC.objects.all().values("phone_num").annotate(count = Count("date"), amount= Sum("amount"))
Update
You could use the following loop to fetch the desired data, as there is no pure Django ORM solution:
# Lazily pair each aggregated row (phone_num, count, amount) with one ABC
# record for that phone number. This runs one extra query per phone number,
# issued as the generator is consumed.
data = (
    # first() orders by primary key, so the same record is picked every time.
    dict(o, data=ABC.objects.filter(phone_num=o['phone_num']).first())
    for o in ABC.objects
    .values("phone_num")
    .order_by()
    .annotate(count=Count("date"), amount=Sum("amount"))
)
# Now you can access your data in the following way:
for item in data:
    phone_num = item['phone_num']
    count = item['count']
    amount = item['amount']
    # Named record_id rather than id to avoid shadowing the builtin.
    record_id = item['data'].id
    name = item['data'].name
    # Do other stuff...
Note
data formed with generator expression(comprehension)

Categories

Resources