Here's an example of the nonsense that Django does with multiple filter and distinct calls:
class Something(models.Model):
name = models.CharField(
_("Name"), max_length=255,
null=False, blank=True, default="")
status = models.CharField(
_("Status of the object."), max_length=255,
null=False, blank=True, choices=[
('1', '1'), ('2', '2'), ('3', '3')])
creation_timestamp = models.DateTimeField(
_("time that this was created"),
auto_now_add=True)
def __unicode__(self):
return u"%s: %s" % (self.name, self.status)
Something.objects.create(name='abc', status=1)
Something.objects.create(name='abc', status=2)
Something.objects.create(name='abc', status=3)
Something.objects.create(name='def', status=1)
Something.objects.create(name='def', status=2)
# Everything seems to be going right
objs = Something.objects.order_by('name', '-creation_timestamp').\
distinct('name')
print(objs)
# [<Something: abc: 3>, <Something: def: 2>]
objs[0]
# <Something: abc: 3>
objs[1]
# <Something: def: 2>
# But now I want to order the results by status:
ordered = objs.order_by('status')
# ...
# DatabaseError: SELECT DISTINCT ON expressions must match initial
# ORDER BY expressions
# LINE 1: SELECT DISTINCT ON ("something"."name") "somet...
# Oops, order_by(...).distinct(...).order_by(...) puts all of the
# order_by calls into the first one rather than allow a sub-select.
# Better reset our transaction so we don't have to exit the shell
from django.db import transaction
transaction.rollback_unless_managed()
# We can try to use a nested query to support this behavior:
# Try using a nested id__in query to protect against this nonsense:
nested = Something.objects.order_by(
'name', '-creation_timestamp').distinct('name')
objs = Something.objects.filter(
id__in=nested.values_list('id', flat=True)).\
order_by('status')
print(objs)
# [<Something: def: 1>, <Something: abd: 1>] # WHAT?
# Hm.. the nested queryset doesn't get resolved independently. Instead
# it's rolled into the containing queryset. This results in
# ordering by the status and then doing the distinct, which gives us
# the completely wrong result.
# What about if we just want the item named 'abc'
objs = Something.objects.order_by('name', '-creation_timestamp').\
distinct('name').filter(name='abc')
print(objs)
# [<Something: abc: 3>]
objs[0]
# <Something: abc: 3> # Looking good!
objs.get()
# <Something: abc: 1> # WHAT?
# This behavior was introduced in
# https://code.djangoproject.com/ticket/12258
# It was a performance tuning change that clears out ORDER BY from
# a query whenever the get() method is called on a queryset.
# Try using a nested id__in query to protect against this nonsense:
nested = Something.objects.order_by(
'name', '-creation_timestamp').distinct('name')
objs = Something.objects.filter(
id__in=nested.values_list('id', flat=True)).\
filter(name='abc')
print(objs)
# [<Something: abc: 3>] # So far so good
objs.get()
# <Something: abc: 1> # WHAT?
# Combination of our above problems: The inner queryset isn't resolved
# and the get() calls clears out our ORDER BY
# OK, try consuming the nested queryset into a list...
nested = Something.objects.order_by(
'name', '-creation_timestamp').distinct('name')
objs = Something.objects.filter(
id__in=list(nested.values_list('id', flat=True))).\
filter(name='abc')
print(objs)
# [<Something: abc: 3>] # Good so far
objs.get()
# <Something: abc: 3> # There it is!
# Finally, expected behavior!
objs = Something.objects.filter(
id__in=list(nested.values_list('id', flat=True))).\
order_by('status')
# [<Something: def: 2>, <Something: abc: 3>]
# Huzzah! We just have to waste a bunch of ram.
# SQLAlchemy wouldn't make us step through these hoops. Just sayin.
# What about using some custom SQL in a queryset.extra(where=[...])?
sql = "SELECT DISTINCT ON (name) id FROM something "\
"ORDER BY name ASC, creation_timestamp DESC"
objs = Something.objects.extra(where=["id IN (%s)" % sql])
objs
# [<Something: abc: 3>, <Something: def: 2>] # Nice!
obj.order_by('status')
# [<Something: def: 2>, <Something: abc: 3>] # Awesome!
objs.filter(name='abc')
# [<Something: abc: 3>]
objs.filter(name='abc').get()
# <Something: abc: 3> # SUCCESS!
# Nice! Now we have a query that does what we want without the big
# memory and cpu footprint that comes with consuming a queryset
Consuming that nested id_in queryset's values_list is a slow and possibly memory intensive operation. We obviously don't want to do that unless we have to. Using django's extra parameter to jam in a raw sql distinct query gives us what we need. It does the DISTINCT correctly and allows us to perform operations on the resulting queryset.