I have a users table which has 3 types of users Student, Faculty and Club and I have a university table.
What I want is how many users are there in the specific university.
I am getting my desired output but the output is very slow.I have 90k users and the output it is generating it takes minutes to produce results.
My user model:-
from __future__ import unicode_literals
from django.db import models
from django.contrib.auth.models import User
from cms.models.masterUserTypes import MasterUserTypes
from cms.models.universities import Universities
from cms.models.departments import MasterDepartments
# WE ARE AT MODELS/APPUSERS
requestChoice = (
('male', 'male'),
('female', 'female'),
)
class Users(models.Model):
id = models.IntegerField(db_column="id", max_length=11, help_text="")
userTypeId = models.ForeignKey(MasterUserTypes, db_column="userTypeId")
universityId = models.ForeignKey(Universities, db_column="universityId")
departmentId = models.ForeignKey(MasterDepartments , db_column="departmentId",help_text="")
name = models.CharField(db_column="name",max_length=255,help_text="")
username = models.CharField(db_column="username",unique=True, max_length=255,help_text="")
email = models.CharField(db_column="email",unique=True, max_length=255,help_text="")
password = models.CharField(db_column="password",max_length=255,help_text="")
bio = models.TextField(db_column="bio",max_length=500,help_text="")
gender = models.CharField(db_column="gender",max_length=6, choices=requestChoice,help_text="")
mobileNo = models.CharField(db_column='mobileNo', max_length=16,help_text="")
dob = models.DateField(db_column="dob",help_text="")
major = models.CharField(db_column="major",max_length=255,help_text="")
graduationYear = models.IntegerField(db_column='graduationYear',max_length=11,help_text="")
canAddNews = models.BooleanField(db_column='canAddNews',default=False,help_text="")
receivePrivateMsgNotification = models.BooleanField(db_column='receivePrivateMsgNotification',default=True ,help_text="")
receivePrivateMsg = models.BooleanField(db_column='receivePrivateMsg',default=True ,help_text="")
receiveCommentNotification = models.BooleanField(db_column='receiveCommentNotification',default=True ,help_text="")
receiveLikeNotification = models.BooleanField(db_column='receiveLikeNotification',default=True ,help_text="")
receiveFavoriteFollowNotification = models.BooleanField(db_column='receiveFavoriteFollowNotification',default=True ,help_text="")
receiveNewPostNotification = models.BooleanField(db_column='receiveNewPostNotification',default=True ,help_text="")
allowInPopularList = models.BooleanField(db_column='allowInPopularList',default=True ,help_text="")
xmppResponse = models.TextField(db_column='xmppResponse',help_text="")
xmppDatetime = models.DateTimeField(db_column='xmppDatetime', help_text="")
status = models.BooleanField(db_column="status", default=False, help_text="")
deactivatedByAdmin = models.BooleanField(db_column="deactivatedByAdmin", default=False, help_text="")
createdAt = models.DateTimeField(db_column='createdAt', auto_now=True, help_text="")
modifiedAt = models.DateTimeField(db_column='modifiedAt', auto_now=True, help_text="")
updatedBy = models.ForeignKey(User,db_column="updatedBy",help_text="Logged in user updated by ......")
lastPasswordReset = models.DateTimeField(db_column='lastPasswordReset',help_text="")
authorities = models.CharField(db_column="departmentId",max_length=255,help_text="")
class Meta:
managed = False
db_table = 'users'
the query i am using which is producing the desired output but too sloq is:-
universities = Universities.objects.using('cms').all()
for item in universities:
studentcount = Users.objects.using('cms').filter(universityId=item.id,userTypeId=2).count()
facultyCount = Users.objects.using('cms').filter(universityId=item.id,userTypeId=1).count()
clubCount = Users.objects.using('cms').filter(universityId=item.id,userTypeId=3).count()
totalcount = Users.objects.using('cms').filter(universityId=item.id).count()
print studentcount,facultyCount,clubCount,totalcount
print item.name
You should use annotate to get the counts for each university and conditional expressions to get the counts based on conditions (docs)
Universities.objects.using('cms').annotate(
studentcount=Sum(Case(When(users_set__userTypeId=2, then=1), output_field=IntegerField())),
facultyCount =Sum(Case(When(users_set__userTypeId=1, then=1), output_field=IntegerField())),
clubCount=Sum(Case(When(users_set__userTypeId=3, then=1), output_field=IntegerField())),
totalcount=Count('users_set'),
)
First, an obvious optimization. In the loop, you're doing essentially the same query four times: thrice filtering for different userTypeId, and once without one. You can do this in a single COUNT(*) ... GROUP BY userTypeId query.
...
# Here, we're building a dict {userTypeId: count}
# by counting PKs over each userTypeId
qs = Users.objects.using('cms').filter(universityId=item.id)
counts = {
x["userTypeId"]: x["cnt"]
for x in qs.values('userTypeId').annotate(cnt=Count('pk'))
}
student_count = counts.get(2, 0)
faculty_count = counts.get(1, 0)
club_count = count.get(3, 0)
total_count = sum(count.values()) # Assuming there may be other userTypeIds
...
However, you're still doing 1+n queries, where n is number of universities you have in the database. This is fine if the number is low, but if it's high you need further aggregation, joining Universities and Users. A first draft I came with is something like this:
# Assuming University.name is unique, otherwise you'll need to use IDs
# to distinguish between different projects, instead of names.
qs = Users.objects.using('cms').values('userTypeId', 'university__name')\
.annotate(cnt=Count('pk').order_by('university__name')
for name, group in itertools.groupby(qs, lambda x: x["university__name"]):
print("University: %s" % name)
cnts = {g["userTypeId"]: g["cnt"] for g in group}
faculty, student, club = cnts.get(1, 0), cnts.get(2, 0), cnts.get(3, 0)
# NOTE: I'm assuming there are only few (if any) userTypeId values
# other than {1,2,3}.
total = sum(cnts.values())
print(" Student: %d, faculty: %d, club: %d, total: %d" % (
student, faculty, club, total))
I might've made a typo there, but hope it's correct. In terms of SQL, it should emit a query like
SELECT uni.name, usr.userTypeId, COUNT(usr.id)
FROM some_app_universities AS uni
LEFT JOUN some_app_users AS usr ON us.universityId = uni.id
GROUP BY uni.name, usr.userTypeId
ORDER BY uni.name
Consider reading documentation on aggregations and annotations. And be sure to check out raw SQL that Django ORM emits (e.g. use Django Debug Toolbar) and analyze how well it works on your database. For example, use EXPLAIN SELECT if you're using PostgreSQL. Depending on your dataset, you may benefit from some indexes there (e.g. on userTypeId column).
Oh, and on a side note... it's off-topic, but in Python it's a custom to have variables and attributes use lowercase_with_underscores. In Django, model class names are usually singular, e.g. User and University.
I'm having a dilemma on choosing whether to use ContentType or using ManyToManyField.
Consider the following example:
class Book(Model):
identifiers = ManyToManyField('Identifier')
title = CharField(max_length=10)
class Series(Model):
identifiers = ManyToManyField('Identifier')
book = ForeignKey('Book')
name = CharField(max_length=10)
class Author(Model):
identifiers = ManyToManyField('Identifier')
name = CharField(max_length=10)
class Identifier(Model):
id_type = ForeignKey('IdType')
value = CharField(max_length=10)
class IdType(Model):
# Sample Value:
# Book: ISBN10, ISBN13, LCCN
# Serial: ISSN
# Author: DAI, AIS
name = CharField(max_length=10)
As you notice, Identifier is being used in many places, and in fact, it is so generic that many business related object requires Identifier, similar to how TagItem from the Django examples is being used.
An alternative approach is to generalized this using the Generic Relation.
class Book(Model):
identifiers = GenericRelation(Identifier)
title = CharField(max_length=10)
authors = ManyToManyField(Author)
class Series(Model):
identifiers = GenericRelation(Identifier)
book = ForeignKey('Book')
name = CharField(max_length=10)
class Author(Model):
identifiers = GenericRelation(Identifier)
name = CharField(max_length=10)
class Identifier(Model):
content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
object_id = models.PositiveIntegerField()
content_object = GenericForeignKey('content_type', 'object_id')
id_type = ForeignKey('IdType')
value = CharField(max_length=10)
class IdType(Model):
# Sample Value:
# Book: ISBN10, ISBN13, LCCN, MyLibrary, YourLibrary, XYZLibrary, etc.
# Serial: ISSN, XYZSerial, etc...
# Author: DAI, AIS, XYZAuthor, etc...
name = CharField(max_length=10)
I'm unsure if I'm raising the right concern regarding Generic Relation.
I'm worried about the data growth of the Identifier table, as it will grow very fast on one table, for example:
100,000 books, average 4 identifiers each. (total 400,000 identifier records)
average 2 authors per book (total 200,000 identifier records)
For each record in book, 4-6x data increased in identifier table. Soon, Identifier Table will be millions of records. Will the queries becoming very slow in the long run? Moreover, I believed that identifier is a field that being queried and used in the application.
Is this generalization correctly done? As in, Author Identifier is completely unrelated with Book Identifier and should have its own BookIdentifier and AuthorIdentifier on its own. Although they seems to have IdType.name and IdType.value pattern, but the domain are completely not related, one is author, the other is book. Should they be generalized? Why not?
What problem could there be if I were to implement under GenericRelation model?
I have the following models:
class AcademicRecord(models.Model):
record_id = models.PositiveIntegerField(unique=True, primary_key=True)
subjects = models.ManyToManyField(Subject,through='AcademicRecordSubject')
...
class AcademicRecordSubject(models.Model):
academic_record = models.ForeignKey('AcademicRecord')
subject = models.ForeignKey('Subject')
language_group = IntegerCharField(max_length=2)
...
class SubjectTime(models.Model):
time_id = models.CharField(max_length=128, unique=True, primary_key=True)
subject = models.ForeignKey(Subject)
language_group = IntegerCharField(max_length=2)
...
class Subject(models.Model):
subject_id = models.PositiveIntegerField(unique=True,primary_key=True)
...
The academic records have list of subjects each with a language code and the subject times have a subject and language code.
With a given AcademicRecord, how can I get the subject times that matches with the AcademicRecordSubjects that the AcademicRecord has?
This is my approach, but it makes more queries than needed:
# record is the given AcademicRecord
times = []
for record_subject in record.academicrecordsubject_set.all():
matched_times = SubjectTime.objects.filter(subject=record_subject.subject)
current_times = matched_times.filter(language_group=record_subject.language_group)
times.append(current_times)
I want to make the query using django ORM not with raw SQL
SubjectTime language group has to match with Subject's language group aswell
I got it, in part thanks to #Robert Jørgensgaard Eng
My problem was how to do the inner join using more than 1 field, in which the F object came on handly.
The correct query is:
SubjectTime.objects.filter(subject__academicrecordsubject__academic_record=record,
subject__academicrecordsubject__language_group=F('language_group'))
Given an AcademicRecord instance academic_record, it is either
SubjectTime.objects.filter(subject__academicrecordsubject_set__academic_record=academic_record)
or
SubjectTime.objects.filter(subject__academicrecordsubject__academic_record=academic_record)
The results reflect all the rows of the join that these ORM queries become in SQL. To avoid duplicates, just use distinct().
Now this would be much easier, if I had a django shell to test in :)
I've looked at doing a query using an extra and/or annotate but have not been able to get the result I want.
I want to get a list of Products, which has active licenses and also the total number of available licenses. An active license is defined as being not obsolete, in date, and the number of licenses less the number of assigned licenses (as defined by a count on the manytomany field).
The models I have defined are:
class Vendor(models.Model):
name = models.CharField(max_length=200)
url = models.URLField(blank=True)
class Product(models.Model):
name = models.CharField(max_length=200)
vendor = models.ForeignKey(Vendor)
product_url = models.URLField(blank=True)
is_obsolete = models.BooleanField(default=False, help_text="Is this product obsolete?")
class License(models.Model):
product = models.ForeignKey(Product)
num_licenses = models.IntegerField(default=1, help_text="The number of assignable licenses.")
licensee_name = models.CharField(max_length=200, blank=True)
license_key = models.TextField(blank=True)
license_startdate = models.DateField(default=date.today())
license_enddate = models.DateField(null=True, blank=True)
is_obsolete = models.BooleanField(default=False, help_text="Is this licenses obsolete?")
licensees = models.ManyToManyField(User, blank=True)
I have tried filtering by the License model. Which works, but I don't know how to then collate / GROUP BY / aggregate the returned data into a single queryset that is returned.
When trying to filter by procuct, I can quite figure out the query I need to do. I can get bits and pieces, and have tried using a .extra() select= query to return the number of available licenses (which is all I really need at this point) of which there will be multiple licenses associated with a product.
So, the ultimate answer I am after is, how can I retrieve a list of available products with the number of available licenses in Django. I'd rather not resort to using raw as much as possible.
An example queryset that gets all the License details I want, I just can't get the product:
License.objects.annotate(
used_licenses=Count('licensees')
).extra(
select={
'avail_licenses': 'licenses_license.num_licenses - (SELECT count(*) FROM licenses_license_licensees WHERE licenses_license_licensees.license_id = licenses_license.id)'
}
).filter(
is_obsolete=False,
num_licenses__gt=F('used_licenses')
).exclude(
license_enddate__lte=date.today()
)
Thank you in advance.
EDIT (2014-02-11):
I think I've solved it in possibly an ugly way. I didn't want to make too many DB calls if I can, so I get all the information using a License query, then filter it in Python and return it all from inside a manager class. Maybe an overuse of Dict and list. Anyway, it works, and I can expand it with additional info later on without a huge amount of risk or custom SQL. And it also uses some of the models parameters that I have defined in the model class.
class LicenseManager(models.Manager):
def get_available_products(self):
licenses = self.get_queryset().annotate(
used_licenses=Count('licensees')
).extra(
select={
'avail_licenses': 'licenses_license.num_licenses - (SELECT count(*) FROM licenses_license_licensees WHERE licenses_license_licensees.license_id = licenses_license.id)'
}
).filter(
is_obsolete=False,
num_licenses__gt=F('used_licenses')
).exclude(
license_enddate__lte=date.today()
).prefetch_related('product')
products = {}
for lic in licenses:
if lic.product not in products:
products[lic.product] = lic.product
products[lic.product].avail_licenses = lic.avail_licenses
else:
products[lic.product].avail_licenses += lic.avail_licenses
avail_products = []
for prod in products.values():
if prod.avail_licenses > 0:
avail_products.append(prod)
return avail_products
EDIT (2014-02-12):
Okay, this is the final solution I have decided to go with. Uses Python to filter the results. Reduces cache calls, and has a constant number of SQL queries.
The lesson here is that for something with many levels of filtering, it's best to get as much as needed, and filter in Python when returned.
class ProductManager(models.Manager):
def get_all_available(self, curruser):
"""
Gets all available Products that are available to the current user
"""
q = self.get_queryset().select_related().prefetch_related('license', 'license__licensees').filter(
is_obsolete=False,
license__is_obsolete=False
).exclude(
license__enddate__lte=date.today()
).distinct()
# return a curated list. Need further information first
products = []
for x in q:
x.avail_licenses = 0
x.user_assigned = False
# checks licenses. Does this on the model level as it's cached so as to save SQL queries
for y in x.license.all():
if not y.is_active:
break
x.avail_licenses += y.available_licenses
if curruser in y.licensees.all():
x.user_assigned = True
products.append(x)
return q
One strategy would be to get all the product ids from your License queryset:
productIDList = list(License.objects.filter(...).values_list(
'product_id', flat=True))
and then query the products using that list of ids:
Product.objects.filter(id__in=productIDList)
I am using django-follow to allow users to "follow" objects - in this example, Actors in films.
I am pulling back a list of film actors using
actors_user_is_following = Follow.objects.get_follows(Actor).filter(user=request.user.id)
But what I also want to do is suggest films to the user based on the actors they are following. This does not need to be a complex algorithm of what they already like and suggesting relative films, just a simple "because you follow this actor and this actor is in this film, suggest it to the user"
I have this rather clunky way of doing this right now...
context['follows'] = {
'actors': Follow.objects.get_follows(Actor).filter(user=request.user.id),
'genres': Follow.objects.get_follows(Genre).filter(user=request.user.id),
}
actor_ids = []
for actor in context['follows']['actors']:
actor_ids.append(actor.target_artist_id)
genre_ids = []
for artist in context['follows']['genres']:
genre_ids.append(artist.genre_ids)
context['suggested'] = {
'films': Listing.objects.filter(Q(actors__in=actor_ids) | Q(genres__in=genre_ids))
}
Which works, but I'm sure there is a better way of doing it?
Most importantly I also want to show the user why that film as been recommended by displaying the actors or genres it features that the user is following, so the end result might be something like...
film = {
title: 'Dodgeball'
image: '/images/films/dodgeball.jpg'
followed_actors: ['Ben Stiller', 'Vince Vaughn'] #could be multiple
followed_genres: ['Comedy'] #could be multiple
}
Note I would want to return multiple films.
Here's how my models are coded up:
Film Model defined like so:
from django.db import models
from app.actors.models import Actor
from app.genres.models import Genre
class Film(models.Model):
title = models.CharField(max_length=255)
strapline = models.CharField(max_length=255)
slug = models.SlugField(max_length=100)
image_url = models.CharField(max_length=255)
pub_date = models.DateTimeField('date published')
actors = models.ManyToManyField(Actor)
genres = models.ManyToManyField(Genre)
def __unicode__(self):
return self.title
And Actor Model:
from django.db import models
from follow import utils
class Actor(models.Model):
title = models.CharField(max_length=255)
strapline = models.CharField(max_length=255)
image = models.CharField(max_length=255)
image_hero = models.CharField(max_length=255)
bio = models.TextField()
def __unicode__(self):
return self.title
#followable
utils.register(Actor)
Behind the scenes, Follow objects are essentially a many-to-many relationship with fields added each time you register a model.
Your question just talks about actors, but your code also includes genres. It's not especially hard to cover both, I'm just not sure which way is the way you want it.
I think you can get your film objects in one queryset:
films = Film.objects.filter(Q(actors__in=Actor.objects.filter(follow_set__user=request.user)) |
Q(genres__in=Genre.objects.filter(follow_set__user=request.user))).distinct()
As noted in the docs for __in lookups, some database back ends will give you better performance if you evaluate the subqueries before using them:
actor_ids = list(Actor.objects.filter(follow_set__user=request.user).values_list('id', flat=True))
genre_ids = list(Genre.objects.filter(follow_set__user=request.user).values_list('id', flat=True))
films = Film.objects.filter(Q(actors__in=actor_ids) | Q(genres__in=genre_ids)).distinct()
If you just want to return the matching films, I think those are the most concise way to express it.
For the part where you're adding the reasons to the films - I don't see a more elegant way to handle that than to iterate through the films queryset and add the information by hand. I would definitely define the querysets for actor_ids and genre_ids before doing so, although whether or not I evaluated them early would still depend on the db back end.
annotated_films = []
for film in films:
film.followed_actors = film.actors.filter(id__in=actor_ids)
film.followed_genres = film.genres.filter(id__in=genre_ids)
annotated_films.append(film)