I have a large dataset with over 1M records. It has a ManyToManyField that causes duplicate rows to be returned when filtering.
models.py:
class Type(models.Model):
    """Named type that ``Catalogue`` rows link to through a many-to-many field."""

    name = models.CharField(db_index=True, max_length=100)
class Catalogue(models.Model):
    """Catalogue entry that can be tagged with several ``Type`` rows."""

    link = models.TextField(null=False)
    image = models.TextField(null=True)
    title = models.CharField(max_length=100, null=True)
    city = models.CharField(max_length=100, null=True, db_index=True)
    district = models.CharField(max_length=100, null=True, db_index=True)
    # Filtering through this many-to-many join can return the same entry
    # once per matching type.
    type = models.ManyToManyField(Type, db_index=True)
    # Stored as text, not as a DateTimeField.
    datetime = models.CharField(max_length=100, null=True, db_index=True)
views.py:
# Entries whose datetime is >= last2week and whose type is one of the wanted ones.
last2week_q = Q(datetime__gte=last2week)
type_q = Q(type__in=intersections)
# Any search term may appear as a substring of either the city or the district.
city_terms_q = Q(*[Q(city__contains=term) for term in city_district], _connector=Q.OR)
district_terms_q = Q(*[Q(district__contains=term) for term in city_district], _connector=Q.OR)
city_district_q = city_terms_q | district_terms_q
models.Catalogue.objects.filter(
    last2week_q & type_q & city_district_q
).order_by('-datetime').distinct()
distinct() is too slow and I'm looking for a different solution to remove duplicates.
P.S:
I also tried to use this query instead of type_q, but it is even slower than distinct(), because typ_ids is a very large list.
typ_ids = models.Catalogue.objects.only('type').filter(type__in=intersections).values_list('id', flat=True)
type_q = Q(id__in=typ_ids)
Related
I have two tables.
class DibbsSpiderDibbsMatchedProductFieldsDuplicate(models.Model):
    """Unmanaged model over ``dibbs_spider_dibbs_matched_product_fields_duplicate``.

    Django does not create or migrate this table (``managed = False``).
    """

    nsn = models.TextField()
    nsn2 = models.TextField()
    cage = models.TextField()
    part_number = models.TextField()
    company_name = models.TextField(blank=True, null=True)
    supplier = models.TextField(db_column='Supplier', blank=True, null=True)  # Field name made lowercase.
    cost = models.CharField(db_column='Cost', max_length=15, blank=True, null=True)  # Field name made lowercase.
    list_price = models.CharField(db_column='List_Price', max_length=15, blank=True, null=True)  # Field name made lowercase.
    gsa_price = models.CharField(db_column='GSA_Price', max_length=15, blank=True, null=True)  # Field name made lowercase.
    hash = models.TextField()
    nomenclature = models.TextField()
    technical_documents = models.TextField()
    solicitation = models.CharField(max_length=32)
    status = models.CharField(max_length=16)
    purchase_request = models.TextField()
    issued = models.DateField()
    return_by = models.DateField()
    file = models.TextField()
    vendor_part_number = models.TextField()
    manufacturer_name = models.TextField(blank=True, null=True)
    product_name = models.TextField(blank=True, null=True)
    unit = models.CharField(max_length=15, blank=True, null=True)

    class Meta:
        managed = False
        db_table = 'dibbs_spider_dibbs_matched_product_fields_duplicate'
class DibbsSpiderSolicitation(models.Model):
    """Unmanaged model over the ``dibbs_spider_solicitation`` table.

    Django does not create or migrate this table (``managed = False``).
    """

    line_items = models.IntegerField()
    nsn = models.TextField()
    nomenclature = models.TextField()
    technical_documents = models.TextField()
    purchase_request = models.TextField()

    class Meta:
        managed = False
        db_table = 'dibbs_spider_solicitation'
What will be the equivalent django query for the inner join of two tables on the column nsn?
My views function will be like
def inner(request, nsn):
    """Render ``a.html`` with the rows of both tables that match ``nsn``.

    ``u_m`` holds the matched-product rows whose ``nsn2`` contains ``nsn``;
    ``c_m`` holds the solicitation rows whose ``nsn`` contains it. Both
    lookups are case-insensitive substring matches.
    """
    # Filter on the ``nsn`` argument; ``id`` would be the Python builtin here,
    # not the search term.
    u_m = DibbsSpiderDibbsMatchedProductFieldsDuplicate.objects.filter(nsn2__icontains=nsn)
    c_m = DibbsSpiderSolicitation.objects.filter(nsn__icontains=nsn)
    obj = ...  # placeholder: the joined u_m/c_m result this question asks for
    context = {'obj': obj}
    return render(request, "a.html", context)
the queryset should return the combination of two tables according to the common nsn.
The obj should return the combination of u_m and c_m. If u_m contains only one row and c_m contains many rows, then obj must repeat the values of u_m.
You can try some of the options:
1. Add a foreign key constraint and use select_related, as per this post.
2. Use a raw query, as mentioned in this Stack Overflow post and another post with custom joins.
3. Use an IN query, as per the following logic:
# Semi-join: keep the matched-product rows whose nsn2 equals the nsn of a
# solicitation that matches the search text. The subquery has to select
# ``nsn``, the column being compared against ``nsn2``.
DibbsSpiderDibbsMatchedProductFieldsDuplicate.objects.filter(
    nsn2__in=DibbsSpiderSolicitation.objects.filter(
        nsn__icontains='text_to_search'
    ).values('nsn')
)
First, great model names. Let's alias them:
DibbsSpiderDibbsMatchedProductFieldsDuplicate is Apples; DibbsSpiderSolicitation is Oranges
# Implicit inner join: add the second table to FROM and pair the rows in WHERE.
inner_qs = Apples.objects.extra(
    tables=["yourapp_oranges"],
    where=["yourapp_apples.nsn=yourapp_oranges.nsn"],
)
The documentation mentions that this api will be deprecated:
https://docs.djangoproject.com/en/4.0/ref/models/querysets/#extra
Option #1 - Introduce a ForeignKey (Recommended):
Under the class DibbsSpiderDibbsMatchedProductFieldsDuplicate add:
# ``on_delete`` is a required argument from Django 2.0 onwards.
# NOTE(review): the model is unmanaged, so the table presumably needs a
# matching ``fkey_id`` column added by hand - confirm against the schema.
fkey = models.ForeignKey('DibbsSpiderSolicitation', on_delete=models.DO_NOTHING)
then you can easily access their join:
# The default manager is ``objects`` (lowercase), and ``filter`` needs a
# keyword lookup; ``nsn`` is the search value passed to the view.
# ``select_related('fkey')`` loads the joined solicitation in the same query.
obj = DibbsSpiderDibbsMatchedProductFieldsDuplicate.objects.filter(
    fkey__nsn__icontains=nsn
).select_related('fkey')
Now it is your choice what you wish to do with nsn2
Option #2 - without a ForeignKey:
Raw SQL:
# ``extra(where=...)`` takes WHERE-clause fragments, not a complete SELECT,
# and the SQL must use the real table names from each model's ``Meta.db_table``.
obj = DibbsSpiderDibbsMatchedProductFieldsDuplicate.objects.extra(
    tables=['dibbs_spider_solicitation'],
    where=[
        'dibbs_spider_solicitation.nsn = '
        'dibbs_spider_dibbs_matched_product_fields_duplicate.nsn2'
    ],
)
# or
# ``raw()`` maps each row onto the model it is called on, so select that
# model's columns (including its primary key) instead of ``*`` from both
# tables; extra columns such as ``line_items`` become attributes on each
# returned instance.
obj = DibbsSpiderDibbsMatchedProductFieldsDuplicate.objects.raw('''
    SELECT m.*, s.line_items
    FROM dibbs_spider_dibbs_matched_product_fields_duplicate AS m
    INNER JOIN dibbs_spider_solicitation AS s
        ON s.nsn = m.nsn2
''')
Using filter:
# A manager has no ``nsn2`` attribute; pass a ``values('nsn2')`` queryset so
# the database evaluates it as an ``IN (SELECT nsn2 ...)`` subquery.
obj = DibbsSpiderSolicitation.objects.filter(
    nsn__in=DibbsSpiderDibbsMatchedProductFieldsDuplicate.objects.values('nsn2')
)
Sorry, I was not able to test any.
I've built a scraper that gets product data from different shopping websites.
When I run python scraper.py the program will print a JSON object containing all the data like this:
{ 'ebay': [ { 'advertiser': 'ebay',
'advertiser_url': 'https://rover.ebay.com/rover/1/711-53200-19255-0/1?ff3=2&toolid=10041&campid=5338482617&customid=&lgeo=1&vectorid=229466&item=302847614914',
'description': '30-Day Warranty - Free Charger & Cable - '
'Easy Returns!',
'main_image': 'https://thumbs1.ebaystatic.com/pict/04040_0.jpg',
'price': '290.0',
'title': 'Apple iPhone 8 Plus Smartphone AT&T Sprint '
'T-Mobile Verizon or Unlocked 4G LTE'}
]}
I want this data to be added to the database automatically every time I run the scraper.
Here's my database structure:
models.py
class Product(models.Model):
    """Product record that the scraper output is meant to populate."""

    similarity_id = models.CharField(max_length=255, blank=True, null=True)
    name = models.CharField(max_length=255, blank=True, null=True)
    url = models.SlugField(blank=True, unique=True, allow_unicode=True)
    advertiser_url = models.TextField(blank=True, null=True)
    main_image = models.TextField(blank=True, null=True)
    second_image = models.TextField(blank=True, null=True)
    third_image = models.TextField(blank=True, null=True)
    old_price = models.FloatField(default=0.00)
    price = models.FloatField(default=0.00)
    discount = models.FloatField(default=0.00)
    currency = models.CharField(max_length=255, default="$")
    description = models.TextField(blank=True, null=True)
    keywords = models.CharField(max_length=255, blank=True, null=True)
    asin = models.CharField(max_length=80, blank=True, null=True)
    iban = models.CharField(max_length=255, blank=True, null=True)
    sku = models.CharField(max_length=255, blank=True, null=True)
    seller = models.CharField(max_length=255, blank=True, null=True)
    free_shipping = models.BooleanField(default=False)
    in_stock = models.BooleanField(default=True)
    sold_items = models.IntegerField(default=0)
    likes_count = models.IntegerField(default=0)
    category = models.CharField(max_length=255, blank=True, null=True)
    sub_category = models.CharField(max_length=255, blank=True, null=True)
    reviews_count = models.IntegerField(default=0)
    rating = models.FloatField(default=0)
    active = models.BooleanField(default=True)
    is_prime = models.BooleanField(default=False)
    created_on = models.DateTimeField(auto_now_add=True)
    advertiser = models.CharField(max_length=255, blank=True, null=True)

    objects = ProductManager()

    class Meta:
        verbose_name_plural = "products"

    def __str__(self):
        # ``name`` is nullable, but __str__ must return a str; returning None
        # raises TypeError wherever the object is rendered.
        return self.name or ""
Add this to scraper.py:
import path.to.model
product = Product()
product.<key> = <value> #Where key is the field and value is the value you need to fill
and after you assign every field, add
product.save()
Trick
If all the keys in the json response match the fields in the model, you can do:
# Copy every key of the response onto the model field of the same name,
# then persist the instance once.
for field_name, value in response.items():
    setattr(product, field_name, value)
product.save()
That will save you a lot of lines and time :)
I work with JSON a lot; I have API caching where I receive a lot of JSON-based API data and I want to store it in a database for querying and caching. If you use Postgres (for instance), you will see that it has extensions for JSON. This means that you can save JSON data in a special JSON field. Better still, there are SQL extensions that let you run queries on the JSON data. That is, Postgres has "NoSQL" capabilities. This lets you work with JSON natively. I find it very compelling and I recommend it highly. There is a learning curve because it uses non-traditional SQL, but heck, we have Stack Overflow.
see: https://django-postgres-extensions.readthedocs.io/en/latest/json.html
here is a little example:
product_onhand_rows = DearCache.objects.filter(
object_type=DearObjectType.PRODUCT_AVAILABILITY.value).filter(
dear_account_id=self.dear_api.account_id).filter(jdata__Location=warehouse).filter(jdata__SKU=sku).all()
in this example, I have the json stored in a field jdata.
jdata__Location accesses the key Location in the json.
It nests and so on. For advanced queries, I resort to sql
select object_type,last_modified, jdata
from cached_dear_dearcache
where object_type = 'orders'
and jdata->>'Status' in ('ESTIMATING','ESTIMATED')
order by last_modified;
and there's more, you can 'unroll' lists (this is what I would call a complicated example, my json has lists of invoices, each of which has a list of lines...)
/* 1. listing invoice lines. We have to iterate over the array of invoices to get each invoice, and then inside the invoice object find the array of lines */
select object_type,last_modified, jsonb_array_elements(jsonb_array_elements(cached_dear_dearcache.jdata#>'{Invoices}')->'Lines') as lines,
jsonb_array_elements(cached_dear_dearcache.jdata#>'{Invoices}')->'InvoiceDate' as invoice_date,
jsonb_array_elements(cached_dear_dearcache.jdata#>'{Invoices}')->'InvoiceNumber' as invoice_number
from cached_dear_dearcache
where object_type = 'orders' order by last_modified;
Your approach is to convert the json data to a traditional sql model. That will work too. It's not very flexible ... if the json "schema" changes, your database schema may need to change. Philosophically, I think it is better to go with the flow, and use the json extensions, this is the best of both worlds. Performance is good, by the way.
I have the following piece of code. I'm using it to return json so Datatables can render it. I pass in query parameters.
def map_query(type_, type_model, type_pk, query_data, request):
    """Return the ``Chgbk`` queryset selected by ``query_data`` on a POST.

    Returns ``None`` when ``request.POST`` is empty. ``type_`` and ``type_pk``
    are accepted but not used in this function.
    """
    problem_filters = get_model_query_data(query_data, Problem)
    # Built but never read below; querysets are lazy, so nothing is fetched.
    problems_filtered = Problem.objects.filter(**problem_filters)
    if request.POST:
        chgbk_filters = get_model_query_data(query_data, type_model)
        results = Chgbk.objects.filter(**chgbk_filters)
    else:
        results = None
    # NOTE(review): the original indentation was lost; this print is assumed
    # to sit at function level, outside the ``if``.
    print(results)
    return results
So type_results_query returns data I want. But Problem model has a foreign key on it which links to key on table. I want to get the data from the Problem table into the Chgbk query as well, sort of the two objects merged but I cannot figure out how to do this and it is driving me crazy.
Models would be:
class Chgbk(VNCModel):
    """Row keyed by ``chgbk_id``; ``Problem`` rows point at it via a foreign key."""

    chgbk_id = models.IntegerField(primary_key=True)
    facility = models.ForeignKey('Facility', models.DO_NOTHING)
    create_dt = models.DateTimeField(null=True, blank=True)
    mod_dt = models.DateTimeField(null=True, blank=True)
    carrier_scac = models.CharField(max_length=25, null=True, blank=True)
    carrier_name = models.CharField(max_length=25, null=True, blank=True)
class Problem(VNCModel):
    """Row keyed by ``problem_id`` with an optional link to one ``Chgbk``."""

    problem_id = models.IntegerField(primary_key=True)
    chgbk = models.ForeignKey(Chgbk, models.DO_NOTHING, null=True, blank=True)
I have 100k records in both model 'A' and in model 'B'
Ex:
class A(models.Model):
    """Record carrying a user's email and mobile, a book id and a gateway response."""

    user_email = models.EmailField(blank=True, null=True)
    user_mobile = models.CharField(max_length=30, blank=True, null=True)
    book_id = models.CharField(max_length=255, blank=True, null=True)
    payment_gateway_response = JSONField(null=True, blank=True)
class B(models.Model):
    """Payment-side record that optionally points back at an ``A`` row."""

    # ``on_delete`` is required from Django 2.0; CASCADE was the implicit
    # default before that, so behaviour on older versions is unchanged.
    order = models.ForeignKey(A, on_delete=models.CASCADE, null=True, blank=True)
    pay_id = models.CharField(max_length=250, null=True, blank=True)
    user_email = models.EmailField(null=True, blank=True)
    user_mobile = models.CharField(max_length=30, null=True, blank=True)
    created = models.DateTimeField(blank=True, null=True)
    total_payment = models.DecimalField(decimal_places=3, max_digits=20, blank=True, null=True)
I want to get B's objects using A's values
for example
# Builds one B queryset per A row, matching on both email and mobile.
all_a = A.objects.all()
for a in all_a:
    b = B.objects.filter(
        user_email=a.user_email,
        user_mobile=a.user_mobile,
    )
This is fine, I am getting the results. But as it's 100k records it's taking too much time. for loop iteration is taking time. Is there any faster way to do this in django?
You can get a list of each value in a and filter b with those values.
a = A.objects.all()
emails = list(a.values_list('user_email', flat=True))
mobiles = list(a.values_list('user_mobile', flat=True))
b = B.objects.filter(user_email__in=emails, user_mobile__in=mobiles)
However, the results may contain email/mobile combinations that are not an actual pair in A. But if you make sure that emails and mobiles are unique in A, and that the email and mobile of each B come from one of the A rows, then you won't have any problems.
If you're not interested in caching the A model, you may have a performance increase using iterator() (see for reference https://docs.djangoproject.com/en/1.11/ref/models/querysets/#iterator):
# ``iterator()`` streams the A rows instead of caching them on the queryset.
for a in A.objects.all().iterator():
    b = B.objects.filter(
        user_email=a.user_email,
        user_mobile=a.user_mobile,
    )
You can do
import operator
from functools import reduce  # not a builtin in Python 3

from django.db.models import Q

# One Q per A row, AND-ing that row's email and mobile; OR-ing them together
# matches only (email, mobile) pairs that actually occur in A.
# Note: ``reduce`` raises TypeError on an empty sequence, i.e. when A has no rows.
q = A.objects.all().values('user_email', 'user_mobile')
B.objects.filter(reduce(operator.or_, [Q(**i) for i in q]))
If you want to perform some operation on every b object that depends on its a, this is not the way.
I have 2 models:
class PR_Components(models.Model):
    """A component, displayed by its number ``comp_nr``."""

    # ``on_delete`` is required from Django 2.0; CASCADE was the implicit
    # default before that.
    companyID = models.ForeignKey(PO_Company, on_delete=models.CASCADE, blank=True, null=True)
    comp_nr = models.CharField(max_length=5, blank=True, null=True)

    def __str__(self):
        # ``comp_nr`` is nullable, but __str__ must return a str.
        return self.comp_nr or ""
class PR_ComponentsData(models.Model):
    """Dated data row (text and image) belonging to one ``PR_Components``."""

    # ``on_delete`` is required from Django 2.0; CASCADE was the implicit
    # default before that.
    compID = models.ForeignKey(PR_Components, on_delete=models.CASCADE, blank=False, null=True)
    valid = models.DateField(max_length=10, blank=False, null=True)
    # ``upload_to`` must be relative to MEDIA_ROOT; Django's system checks
    # reject an absolute path such as "/images".
    comp_image = models.ImageField(upload_to="images", blank=True, null=True)
    comp_text = models.CharField(max_length=30, blank=False, null=True)
....
I want to show now in a selectbox the components number (PR_Components.comp_nr) and their current valid name (PR_Componentsdata.comp_text).
I added a manager to model PR_Components which executes a sql-query.
-- For each component, keep only the data row for which no later row exists:
-- b2 looks for a row with a greater valid date (or the same date and a
-- higher id), and "b2.id IS NULL" keeps the b1 rows that have none.
SELECT a.*, b1.*
FROM pool_pr_components a
JOIN pool_pr_componentsdata b1
    ON (a.id = b1.compID_id)
LEFT OUTER JOIN pool_pr_componentsdata b2
    ON (a.id = b2.compID_id
        AND (b1.valid < b2.valid
             OR (b1.valid = b2.valid AND b1.id < b2.id)))
WHERE b2.id IS NULL
Later I write forms dynamicly and add the sql-result to the field:
self.fields[field].queryset = sql_result
Until here, everything works fine.
My problem:
In the selectbox the result of the str-Method of model PR_Components is shown (=comp_nr), but I would like to show also the component name like "Component (001)".
How could I do this? It should be a solution which works for other models too, because a lot of my models have "historical" data.
Thanks a lot
Solution:
In the model PR_Components we override __str__ with:
def __str__(self):
    """Return ``"<comp_text> (<comp_nr>)"`` using the most recent data row."""
    # Follow the foreign key to this component's data rows (a data row's own
    # id is unrelated to the component's id) and take the latest by validity
    # date, breaking ties by id.
    related = (
        PR_ComponentsData.objects
        .filter(compID=self)
        .order_by('valid', 'id')
        .last()
    )
    if related is None:
        # No data row yet: fall back to the bare component number.
        return self.comp_nr or ""
    return "{} ({})".format(related.comp_text, self.comp_nr)