Django - Getting/Saving large objects takes a lot of time - python

I'm trying to get a few million of items from a model, and parsing them. However, somehow it spends a lot of time trying to get the data saved.
These are the current models that I have:
class mapt(models.Model):
s = models.IntegerField(primary_key=True)
name = models.CharField(max_length=2000)
def __unicode__(self):
return str(self.s)
class datt(models.Model):
s = models.IntegerField(primary_key=True)
setid = models.IntegerField()
var = models.IntegerField()
val = models.IntegerField()
def __unicode(self):
return str(self.s)
class sett(models.Model):
setid = models.IntegerField(primary_key=True)
block = models.IntegerField()
username = models.IntegerField()
ts = models.IntegerField()
def __unicode__(self):
return str(self.setid)
class data_parsed(models.Model):
setid = models.IntegerField(max_length=2000, primary_key=True)
block = models.CharField(max_length=2000)
username = models.CharField(max_length=2000)
data = models.CharField(max_length=200000)
time = models.IntegerField()
def __unicode__(self):
return str(self.setid)
The s parameter for the datt model should actually act as a foreign key to mapt's s parameter. Furthermore, sett's setid field should act as a foreign key to setid's setid.
Lastly, data_parsed's setid is a foreign key to sett's models.
The algorithm is currently written this way:
def database_rebuild(start_data_parsed):
listSetID = []
#Part 1
for items in sett.objects.filter(setid__gte=start_data_parsed):
listSetID.append(items.setid)
uniqueSetID = listSetID
#Part 2
for items in uniqueSetID:
try:
SetID = items
settObject = sett.objects.get(setid=SetID)
UserName = mapt.objects.get(pk=settObject.username).name
TS = pk=settObject.ts
BlockName = mapt.objects.get(pk=settObject.block).name
DataPairs_1 = []
DataPairs_2 = []
DataPairs_1_Data = []
DataPairs_2_Data = []
for data in datt.objects.filter(setid__exact=SetID):
DataPairs_1.append(data.var)
DataPairs_2.append(data.val)
for Data in DataPairs_1:
DataPairs_1_Data.append(mapt.objects.get(pk=Data).name)
for Data in DataPairs_2:
DataPairs_2_Data.append(mapt.objects.get(pk=Data).name)
assert (len(DataPairs_1) == len(DataPairs_2)), "Length not equal"
#Part 3
Serialize = []
for idx, val in enumerate(DataPairs_1_Data):
Serialize.append(str(DataPairs_1_Data[idx]) + ":PARSEABLE:" + str(DataPairs_2_Data[idx]) + ":PARSEABLENEXT:")
Serialize_Text = ""
for Data in Serialize:
Serialize_Text += Data
Data = Serialize_Text
p = data_parsed(SetID, BlockName, UserName, Data, TS)
p.save()
except AssertionError, e:
print "Error:" + str(e.args)
print "Possibly DataPairs does not have equal length"
except Exception as e:
print "Error:" + str(sys.exc_info()[0])
print "Stack:" + str(e.args)
Basically, what it does is that:
Finds all sett objects that is greater than a number
Gets the UserName, TS, and BlockName, then get all the fields in datt field that correspond to a var and val field maps to the mapt 's' field. Var and Val is basically NAME_OF_FIELD:VALUE type of relationship.
Serialize all the var and val parameters so that I could get all the parameters from var and val that is spread across the mapt table in a row in data_parsed.
The current solution does everything I would like to, however, on a Intel Core i5-4300U CPU # 1.90Ghz, it parses around 15000 rows of data daily on a celery periodic worker. I have around 3355566 rows of data at my sett table, and it will take around ~23 days to parse them all.
Is there a way to speed up the process?
============================Updated============================
New Models:
class mapt(models.Model):
s = models.IntegerField(primary_key=True)
name = models.CharField(max_length=2000)
def __unicode__(self):
return str(self.s)
class sett(models.Model):
setid = models.IntegerField(primary_key=True)
block = models.ForeignKey(mapt, related_name='sett_block')
username = models.ForeignKey(mapt, related_name='sett_username')
ts = models.IntegerField()
def __unicode__(self):
return str(self.setid)
# class sett(models.Model):
# setid = models.IntegerField(primary_key=True)
# block = models.IntegerField()
# username = models.IntegerField()
# ts = models.IntegerField()
# def __unicode__(self):
# return str(self.setid)
class datt(models.Model):
s = models.IntegerField(primary_key=True)
setid = models.ForeignKey(sett, related_name='datt_setid')
var = models.ForeignKey(mapt, related_name='datt_var')
val = models.ForeignKey(mapt, related_name='datt_val')
def __unicode(self):
return str(self.s)
# class datt(models.Model):
# s = models.IntegerField(primary_key=True)
# setid = models.IntegerField()
# var = models.IntegerField()
# val = models.IntegerField()
# def __unicode(self):
# return str(self.s)
class data_parsed(models.Model):
setid = models.ForeignKey(sett, related_name='data_parsed_setid', primary_key=True)
block = models.CharField(max_length=2000)
username = models.CharField(max_length=2000)
data = models.CharField(max_length=2000000)
time = models.IntegerField()
def __unicode__(self):
return str(self.setid)
New Parsing:
def database_rebuild(start_data_parsed, end_data_parsed):
for items in sett.objects.filter(setid__gte=start_data_parsed, setid__lte=end_data_parsed):
try:
UserName = mapt.objects.get(pk=items.username_id).name
TS = pk=items.ts
BlockName = mapt.objects.get(pk=items.block_id).name
DataPairs_1 = []
DataPairs_2 = []
DataPairs_1_Data = []
DataPairs_2_Data = []
for data in datt.objects.filter(setid_id__exact=items.setid):
DataPairs_1.append(data.var_id)
DataPairs_2.append(data.val_id)
for Data in DataPairs_1:
DataPairs_1_Data.append(mapt.objects.get(pk=Data).name)
for Data in DataPairs_2:
DataPairs_2_Data.append(mapt.objects.get(pk=Data).name)
assert (len(DataPairs_1) == len(DataPairs_2)), "Length not equal"
Serialize = []
for idx, val in enumerate(DataPairs_1_Data):
Serialize.append(str(DataPairs_1_Data[idx]) + ":PARSEABLE:" + str(DataPairs_2_Data[idx]))
Data = ":PARSEABLENEXT:".join(Serialize)
p = data_parsed(items.setid, BlockName, UserName, Data, TS)
p.save()
except AssertionError, e:
print "Error:" + str(e.args)
print "Possibly DataPairs does not have equal length"
except Exception as e:
print "Error:" + str(sys.exc_info()[0])
print "Stack:" + str(e.args)

Defining lists by appending repeadedly is very slow. Use list comprehensions or even just the list() constructor.
In python you should not join a list of strings using for loops and +=, you should use join().
But that is not the primary bottleneck here. You have a lot of objects.get()s which each takes a database roundtrip. If you didn't have milions of rows in the mapt table, you should probably just make a dictionary mapping mapt primary keys to mapt objects.
Had you defined your foreign keys as foreign keys the django orm could help you do much of this in like five queries in total. That is, instead of SomeModel.objects.get(id=some_instance.some_fk_id) you can do some_instance.some_fk (which will only hit the databse the first time you do it for each instance). You can then even get rid of the foreign key query if some_instance had been initialized as some_instance = SomeOtherModel.objects.select_related('some_fk').get(id=id_of_some_instance).
Perhaps changing the models without changing the database will work.

Related

IntegrityError for AutoField after a bulk_create()

after doing a bulk_create, I'm trying to add some elements to my table but I got an IntegrityError as the sequence still starts with 1 when using save()
Is there any way to update the sequence within Django after the bulk_create ? Or should I generate the id myself using a max filter ?
I'm using a postgre database.
Here is a sample of my code:
elements is a list of Identifiants objects (I'm trying to import the list on a "dump" database first to be sure everything is fine)
try:
Identifiants.objects.using('import-check').all().delete()
Identifiants.objects.using('import-check').bulk_create(elements)
except:
traceback.print_exc()
return False
else:
Identifiants.objects.all().delete()
Identifiants.objects.bulk_create(elements)
return True
And here's the model
class Identifiants(models.Model):
taxon = models.IntegerField(unique=True)
noms = models.TextField(blank=True, null=True)
fiche = models.IntegerField(blank=True, null=True)
sms = models.NullBooleanField()
I let Django create the pk itself
And the view related for further insertions :
elif req.method == 'POST' and req.POST['action'] == "add":
id_form = AddFormId(req.POST)
nom_form = AddFormNom(req.POST)
search_form = LightSearchForm(req.POST)
if id_form.is_valid() and nom_form.is_valid():
inst = id_form.save(commit = False)
num_fiche = Identifiants.objects.all().aggregate(Max('fiche'))['fiche__max']
num_fiche += 1
inst.fiche = num_fiche
inst.save()
values = nom_form.save(commit = False)
values.taxon = inst
values.codesyno = 0
values.save()
return redirect(reverse(details, kwargs = {'id_item' : values.id}))
I got the issue with multiple tables. I always add elements with a form.

Create error message datefield

I want to create an error message for following form:
class ExaminationCreateForm(forms.ModelForm):
class Meta:
model = Examination
fields = ['patient', 'number_of_examination', 'date_of_examination']
Models:
class Patient(models.Model):
patientID = models.CharField(max_length=200, unique=True, help_text='Insert PatientID')
birth_date = models.DateField(auto_now=False, auto_now_add=False, help_text='YYYY-MM-DD')
gender = models.CharField(max_length=200,choices=Gender_Choice, default='UNDEFINED')
class Examination(models.Model):
number_of_examination = models.IntegerField(choices=EXA_Choices)
patient = models.ForeignKey(Patient, on_delete=models.CASCADE)
date_of_examination = models.DateField(auto_now=False, auto_now_add=False, help_text='YYYY-MM-DD')
Every Patient has 2 Examinations (number of examination = Choices 1 or 2) and the error message should be activated when the date of the second examination < date of the first examination. Something like this:
Solution: `
def clean_date_of_examination(self):
new_exam = self.cleaned_data.get('date_of_examination')
try:
old_exam = Examination.objects.get(patient=self.cleaned_data.get('patient'))
except Examination.DoesNotExist:
return new_exam
if old_exam:
if old_exam.date_of_examination > new_exam:
raise forms.ValidationError("Second examination should take place after first examination")
return new_exam`
def clean_date_of_examination(self):
new_exam = self.cleaned_data.get('date_of_examination')
old_exam = Examination.objects.get(patient = self.cleaned_data.get('Patient'))
if old_exam:
if old_exam.date_of_examination > new_exam.date_of_examination:
raise forms.ValidationError("Second examination should take place after first examination")
return data
def clean_date_of_examination(self):
# Where 'data' is used?
date_of_exam = self.cleaned_data['date_of_examination']
try:
pat1 = Patient.object.get(examination__number_of_examination=1, date_of_examination=date_of_exam)
except Patiens.DoesNotExist:
# Patient 1 with given query doesn't exist. Handle it!
try:
pat2 = Patient.object.get(examination__number_of_examination=2, date_of_examination=date_of_exam)
except Patiens.DoesNotExist:
# Patient 2 with given query doesn't exist. Handle it!
if pat2.date_of_examination < pat1.date_of_examination:
raise forms.ValidationError("Second examination should take place after first examination")`
return data`

Save function in django/python duplicating entries

I'm writing a football scoring app for a local league using the same schema as the NFL's gameday DB. I created a function that will eventually run on it's own updating each player's scores.
The problem comes when the function to create a new points record is run it duplicates the entry for each player, there's no error shown or anything, everything runs as expected except for the duplicate values.
here are my views.py:
def updatepoints(request):
actual = get_object_or_404(CurrentWeek, status='active')
week = actual.week
season = actual.season
ptsexist = Puntos.objects.filter(week=week, season=season)
if ptsexist:
pts = Player.objects.raw('''SELECT DISTINCT player.player_id,(SELECT (SELECT SUM(play_player.passing_yds))+(SELECT SUM(play_player.passing_tds))+(SELECT SUM(play_player.passing_twoptm))+(SELECT SUM(play_player.passing_int))+(SELECT SUM(play_player.rushing_yds))+(SELECT SUM(play_player.rushing_tds))+(SELECT SUM(play_player.rushing_twoptm))+(SELECT SUM(play_player.fumbles_lost))+(SELECT SUM(play_player.receiving_yds))+(SELECT SUM(play_player.receiving_tds))+(SELECT SUM(play_player.receiving_twoptm))+(SELECT SUM(play_player.receiving_rec))+(SELECT SUM(play_player.kicking_fgm))+(SELECT SUM(play_player.kicking_xpmade))+(SELECT SUM(play_player.fumbles_rec_tds))+(SELECT SUM(play_player.kicking_rec_tds))) AS total,id_puntos FROM player INNER JOIN play_player ON player.player_id = play_player.player_id INNER JOIN game ON play_player.gsis_id = game.gsis_id LEFT JOIN points ON player.player_id = points.player_id AND points.temporada = game.season_year AND "DraftFantasy_puntos".semana = game.week WHERE game.week = %s AND game.season_year = %s AND game.season_type != 'Warmup' AND game.season_type != 'Postseason' GROUP BY player.player_id,points.id_points''', [week, season])
for obj in pts:
obj.id = obj.player_id
obj.points = obj.total
obj.idpoints = obj.id_points
form = UpdatePointsForm(request.POST)
pointsf = form.save(commit=False)
pointsf.id_points = obj.idpoints
pointsf.player_id = obj.player_id
pointsf.temporada = season
pointsf.semana = week
pointsf.puntos_ppr = obj.total
pointsf.save()
return HttpResponseRedirect("/dashboard/")
else:
return HttpResponseRedirect("/savepoints/")
def savepoints(request):
actual = get_object_or_404(CurrentWeek, status='active')
week = actual.week
season = actual.season
ptsn = Player.objects.raw('''SELECT DISTINCT player.player_id,(SELECT (SELECT SUM(play_player.passing_yds))+(SELECT SUM(play_player.passing_tds))+(SELECT SUM(play_player.passing_twoptm))+(SELECT SUM(play_player.passing_int))+(SELECT SUM(play_player.rushing_yds))+(SELECT SUM(play_player.rushing_tds))+(SELECT SUM(play_player.rushing_twoptm))+(SELECT SUM(play_player.fumbles_lost))+(SELECT SUM(play_player.receiving_yds))+(SELECT SUM(play_player.receiving_tds))+(SELECT SUM(play_player.receiving_twoptm))+(SELECT SUM(play_player.receiving_rec))+(SELECT SUM(play_player.kicking_fgm))+(SELECT SUM(play_player.kicking_xpmade))+(SELECT SUM(play_player.fumbles_rec_tds))+(SELECT SUM(play_player.kicking_rec_tds))) AS total FROM player INNER JOIN play_player ON player.player_id = play_player.player_id INNER JOIN game ON play_player.gsis_id = game.gsis_id WHERE game.week = %s AND game.season_year = %s AND game.season_type != 'Warmup' AND game.season_type != 'Postseason' GROUP BY player.player_id''', [week, season])
for obj in ptsn:
obj.id = obj.player_id
obj.points = obj.total
formn = PointsForm(request.POST)
pointsfn = formn.save(commit=False)
pointsfn.player_id = obj.player_id
pointsfn.temporada = season
pointsfn.semana = week
pointsfn.points = obj.total
pointsfn.save()
return HttpResponseRedirect("/ligas/")
the forms.py:
class PointsForm(forms.ModelForm):
class Meta:
model = Points
exclude = ["player_id",
"season",
"week",
"puntos"]
class UpdatePointsForm(forms.ModelForm):
class Meta:
model = Points
exclude = ["id_points",
"player_id",
"season",
"week",
"points"]
and the models.py:
class Points(models.Model):
id_points = models.AutoField(primary_key=True, null=False, max_length=15)
player_id = models.CharField(max_length=100)
season = models.IntegerField(max_length=10)
week = models.IntegerField(max_length=10)
puntos = models.IntegerField(max_length=50)
class CurrentWeek(models.Model):
id_week = models.AutoField(primary_key=True, null=False, max_length=15)
week = models.IntegerField(max_length=10)
season = models.IntegerField(max_length=5)
status = models.CharField(max_length=50, default="done")
I'm really stumped so any help will be much appreciated.

Iterating with Django ORM through large datasets is slow

I'm using Django ORM to get data out of a database with a few million items. However, computation takes a while (40 minutes+), and I'm not sure how to pin point where the issue is located.
Models I've used:
class user_chartConfigurationData(models.Model):
username_chartNum = models.ForeignKey(user_chartConfiguration, related_name='user_chartConfigurationData_username_chartNum')
openedConfig = models.ForeignKey(user_chartConfigurationChartID, related_name='user_chartConfigurationData_user_chartConfigurationChartID')
username_selects = models.CharField(max_length=200)
blockName = models.CharField(max_length=200)
stage = models.CharField(max_length=200)
variable = models.CharField(max_length=200)
condition = models.CharField(max_length=200)
value = models.CharField(max_length=200)
type = models.CharField(max_length=200)
order = models.IntegerField()
def __unicode__(self):
return str(self.username_chartNum)
order = models.IntegerField()
class data_parsed(models.Model):
setid = models.ForeignKey(sett, related_name='data_parsed_setid', primary_key=True)
setid_hash = models.CharField(max_length=100, db_index = True)
block = models.CharField(max_length=2000, db_index = True)
username = models.CharField(max_length=2000, db_index = True)
time = models.IntegerField(db_index = True)
time_string = models.CharField(max_length=200, db_index = True)
def __unicode__(self):
return str(self.setid)
class unique_variables(models.Model):
setid = models.ForeignKey(sett, related_name='unique_variables_setid')
setid_hash = models.CharField(max_length=100, db_index = True)
block = models.CharField(max_length=200, db_index = True)
stage = models.CharField(max_length=200, db_index = True)
variable = models.CharField(max_length=200, db_index = True)
value = models.CharField(max_length=2000, db_index = True)
class Meta:
unique_together = (("setid", "block", "variable", "stage", "value"),)
The code I'm running is looping through data_parsed, with relevant data that matches between user_chartConfigurationData and unique_variables.
#After we get the tab, we will get the configuration data from the config button. We will need the tab ID, which is chartNum, and the actual chart
#That is opened, which is the chartID.
chartIDKey = user_chartConfigurationChartID.objects.get(chartID = chartID)
for i in user_chartConfigurationData.objects.filter(username_chartNum = chartNum, openedConfig = chartIDKey).order_by('order').iterator():
iterator = data_parsed.objects.all().iterator()
#We will loop through parsed objects, and at the same time using the setid (unique for all blocks), which contains multiple
#variables. Using the condition, we can set the variable gte (greater than equal), or lte (less than equal), so that the condition match
#the setid for the data_parsed object, and variable condition
for contents in iterator:
#These are two flags, found is when we already have an entry inside a dictionary that already
#matches the same setid. Meaning they are the same blocks. For example FlowBranch and FlowPure can belong
#to the same block. Hence when we find an entry that matches the same id, we will put it in the same dictionary.
#Added is used when the current item does not map to a previous setid entry in the dictionary. Then we will need
#to add this new entry to the array of dictionary (set_of_pk_values). Otherwise, we will be adding a lot
#of entries that doesn't have any values for variables (because the value was added to another entry inside a dictionary)
found = False
added = False
storeItem = {}
#Initial information for the row
storeItem['block'] = contents.block
storeItem['username'] = contents.username
storeItem['setid'] = contents.setid
storeItem['setid_hash'] = contents.setid_hash
if (i.variable != ""):
for findPrevious in set_of_pk_values:
if(str(contents.setid) == str(findPrevious['setid'])):
try:
items = unique_variables.objects.get(setid = contents.setid, variable = i.variable)
findPrevious[variableName] = items.value
found = True
break
except:
pass
if(found == False):
try:
items = unique_variables.objects.get(setid = contents.setid, variable = i.variable)
storeItem[variableName] = items.value
added = True
except:
pass
if(found == False and added == True):
storeItem['time_string'] = contents.time_string
set_of_pk_values.append(storeItem)
I've tried to use select_related() or prefetch_related(), since it needs to go to unique_variables object and get some data, however, it still takes a long time.
Is there a better way to approach this problem?
Definitely, have a look at django_debug_toolbar. It will tell you how many queries you execute, and how long they last. Can't really live without this package when I have to optimize something =).
PS: Execution will be even slower.
edit: You may also want to enable db_index for the fields you use to filter with or index_together for more than one field. Ofc, measure the times between your changes so you make sure which option is better.

GAE Search: Relationships between documents in index

I'd like to return a result object which contains the indexed document AND other information, from another entity, with which the indexed document has a relationship.
So, let's say I have two Kinds:
class Store(BaseHandler):
store_name = ndb.StringProperty()
logo_url = ndb.StringProperty()
about_store = ndb.TextProperty()
class Product(BaseHandler):
product_name = ndb.StringProperty
store_key = ndb.KeyProperty() #Store entity which created this product.
Then, I add each new Product entity to the index, like this:
class NewProduct(BaseHandler):
def get(self, store_id):
self.render('new-product.html')
def post(self, store_id):
product_name = self.request.get('product_name')
store_key = ndb.Key('Store', store_id)
try:
p = Product(
store_key = store_key,
product_name = product_name)
p.put()
# Add p to index
p_doc = search.Document(
doc_id = str(p.key.id()),
fields = [
search.AtomField(name = 'store_id', value = str(str_id)),
search.TextField(name = 'product_name', value = e.product_name)])
index = search.Index('product_index')
index.put(p_doc)
except:
# handle error
Now, if I run a search query using...
index = search.Index('product_index')
index.search('PRODUCT_NAME')
I should be able to return all the Product documents from the index by its query string.
My question is: How do I efficiently return a result object which contains both the product document AND its Store kind information (store_name, logo_url, about_store)?

Categories

Resources