-
-
Save FND/1244491 to your computer and use it in GitHub Desktop.
Gemfile.lock |
#!/usr/bin/env ruby | |
# encoding: UTF-8 | |
# test case for eager loading of nested associations with DataMapper | |
require 'rubygems' | |
require 'dm-core' | |
require 'dm-constraints' | |
require 'dm-migrations' | |
require 'eager_loading' | |
DataMapper::Logger.new($stdout, :debug) | |
DataMapper.setup(:default, "sqlite3://#{Dir.pwd}/db.sqlite") | |
class Person | |
include DataMapper::Resource | |
property :id, Serial | |
property :name, String, :required => true | |
has n, :vehicles | |
end | |
class Vehicle | |
include DataMapper::Resource | |
property :id, Serial | |
property :name, String, :required => true | |
has n, :components | |
end | |
class Component | |
include DataMapper::Resource | |
property :id, Serial | |
property :name, String, :required => true | |
belongs_to :manufacturer | |
end | |
class Manufacturer | |
include DataMapper::Resource | |
property :id, Serial | |
property :name, String, :required => true | |
end | |
DataMapper.auto_migrate! | |
# generate test data | |
Person.create(:name => "FND", :vehicles => [ | |
Vehicle.create(:name => "Taurus", :components => [ | |
Component.create(:name => "engine", | |
:manufacturer => Manufacturer.create(:name => "Ford")), | |
Component.create(:name => "radio", | |
:manufacturer => Manufacturer.create(:name => "Bose")) | |
]), | |
Vehicle.create(:name => "fixie", :components => [ | |
Component.create(:name => "frame", | |
:manufacturer => Manufacturer.create(:name => "Campagnolo")), | |
Component.create(:name => "breaks", | |
:manufacturer => Manufacturer.create(:name => "Shimano")) | |
]) | |
]) | |
Person.create(:name => "tillsc", :vehicles => [ | |
Vehicle.create(:name => "Golf", :components => [ | |
Component.create(:name => "engine", | |
:manufacturer => Manufacturer.create(:name => "VW")) | |
]) | |
]) | |
# retrieve data | |
puts "", "[INFO] test case A" | |
person = Person.get!(1) | |
puts person.vehicles.components.manufacturer.map(&:name).join(", ") | |
puts "", "[INFO] test case B" | |
people = Person.all | |
people.each do |person| | |
person.vehicles.each do |vehicle| | |
puts sprintf("%-10s %-10s", person.name, vehicle.name) | |
end | |
end | |
puts "", "[INFO] test case C ===== /!\ n+1 hazard ====" | |
people = Person.all | |
people.each do |person| | |
person.vehicles.each do |vehicle| | |
vehicle.components.each do |component| | |
puts sprintf("%-10s %-10s %-10s", person.name, vehicle.name, component.name) | |
end | |
end | |
end | |
puts "", "[INFO] test case D" | |
people = Person.all | |
people.eager_load(Person.vehicles.components).each do |person| | |
person.vehicles.each do |vehicle| | |
vehicle.components.each do |component| | |
puts sprintf("%-10s %-10s %-10s", person.name, vehicle.name, component.name) | |
end | |
end | |
end |
# manual eager loading for DataMapper | |
# adapted from Chris Corbyn: https://gist.github.com/1244491#gistcomment-56797 | |
module EagerLoading | |
def eager_load(query_path) | |
scope = self | |
query_path.relationships.each do |relation| | |
source_key = relation.source_key.first # TODO: rename | |
target_key = relation.target_key.first # TODO: rename | |
# for each level in the query path, collect all the resources referencing | |
# keys at the current scope | |
next_scope = relation.target_model.all(target_key.name => scope. | |
collect(&:"#{source_key.name}")) | |
# map target keys to the resources that exist for them | |
links = next_scope.inject({}) do |map, resource| | |
map.merge(target_key.get(resource) => [resource]) { |k, v1, v2| v1 + v2 } | |
end | |
# now pre-load those from the map | |
scope.each do |parent| | |
if links.key?(source_key.get(parent)) | |
parent.instance_variable_set(:"@#{relation.name}", | |
links[source_key.get(parent)]) | |
end | |
end | |
# and step into the next nesting level | |
scope = next_scope | |
end | |
self | |
end | |
end | |
DataMapper::Collection.send(:include, EagerLoading) |
source :rubygems | |
DM_VERSION = '~> 1.2.0.rc2' | |
gem 'dm-core', DM_VERSION | |
gem 'dm-constraints', DM_VERSION | |
gem 'dm-migrations', DM_VERSION | |
gem 'dm-sqlite-adapter', DM_VERSION |
#!/usr/bin/env sh | |
rm db.sqlite | |
# reformat SQL queries for readability | |
bundle exec ./dm_el.rb | perl -pe 's#SELECT .*? (FROM ".*?" )(.*)#\1SELECT ... \2#' |
Turns out overriding #components
is not needed... DM already uses that instance variable name.
This should do it:
class EagerLoader
def initialize(collection)
@collection = collection
end
def load(query_path)
scope = @collection
query_path.relationships.each do |relation|
# for each level in the query path, collect all the resources referencing keys at the current scope
next_scope = relation.target_model.all(relation.target_key.first.name => scope.collect(&:"#{relation.source_key.first.name}"))
# create a map between the target keys and the resources that exist for them
links = next_scope.inject({}) do |map, resource|
map.merge(relation.target_key.first.get(resource) => [resource]) { |k, v1, v2| v1 + v2 }
end
# now pre-load those from the map
scope.each do |parent|
if links.key?(relation.source_key.first.get(parent))
parent.instance_variable_set(:"@#{relation.name}", links[relation.source_key.first.get(parent)])
end
end
# and step into the next nesting level
scope = next_scope
end
end
end
You use it like this:
people = Person.all
EagerLoader.new(people).load(People.vehicles.components)
In the above example, 3 queries are executed as soon as EagerLoader#load
is invoked (one for all the Person resources, one for all the Vehicle resources and one for all the Component resources). I don't believe it can be delayed until iteration without being considerably more complex, though certainly not impossible.
Now when you iterate them, no further SQL is executed, since the resources have all been eager-loaded.
I am tempted to take this and embed it in DataMapper::Collection
so that you can invoke the pre-loading at the latest time possible, e.g.
Personal.all.preload(Person.vehicles.components).each do |person|
...
end
That's actually a no-brainer. Having it happen automagically would be nice though. I don't doubt that it's complicated otherwise the DM guys would have done it already :)
Integrated into DataMapper::Collection.
module EagerLoading
def eager_load(query_path)
scope = self
query_path.relationships.each do |relation|
# for each level in the query path, collect all the resources referencing keys at the current scope
next_scope = relation.target_model.all(relation.target_key.first.name => scope.collect(&:"#{relation.source_key.first.name}"))
# create a map between the target keys and the resources that exist for them
links = next_scope.inject({}) do |map, resource|
map.merge(relation.target_key.first.get(resource) => [resource]) { |k, v1, v2| v1 + v2 }
end
# now pre-load those from the map
scope.each do |parent|
if links.key?(relation.source_key.first.get(parent))
parent.instance_variable_set(:"@#{relation.name}", links[relation.source_key.first.get(parent)])
end
end
# and step into the next nesting level
scope = next_scope
end
self
end
end
DataMapper::Collection.send(:include, EagerLoading)
Person.all.eager_load(Person.vehicles.components).each do |person|
person.vehicles.each do |vehicle|
vehicle.components.each do |component|
...
end
end
end
@d11wtq — I actually like the version that uses an external EagerLoader instance, since that will allow decomposing the fairly complex #eager_load method without polluting the Collection method namespace. Also, it will be easier to unit test. That said, adding an interface to Collection for launching an EagerLoader makes perfect sense (and provides a nice API, a little nicer even than AR's Model.includes, IMO).
I would propose:
class DataMapper::Collection::EagerLoader
attr_reader :collection
attr_reader :query_path
def initialize(collection, query_path)
@collection = collection
@query_path = query_path
end
def load
# method body from your example
end
end
module DataMapper::Collection::EagerLoading
def eager_load(query_path)
EagerLoader.new(self, query_path).load
end
end
Also, this looks like it won't work with composite primary keys (eg., relation.target_key.first.name
). I believe you can use a relationship directly as a predicate in a query, and still pass an array or collection of instances as the value for that predicate. So, I think you could do:
# cutting to the chase
next_scope = relation.target_model.all(relation => scope)
And then use the full target_key
and source_key
(they're PropertySet
instances, which provide #get
and #set
, so you can use them somewhat like Property
and Relationship
), ex:
map.merge(relation.target_key.get(resource) => [resource]) { |k, v1, v2| v1 + v2 }
Is it worth any of this being pulled back into dm-core, if it's solid enough after a bit of input from several devs? It's almost certainly worth making into a little gem if nothing else. It would need to behave the same as a non-eager-loaded collection first though (i.e. load Collection.new(query, resources)
instead of just an Array).
In a perfect world, this would be totally transparent to the user (as it currently works for a single level), but I think there are some pretty tricky issues involved with doing it transparently. To be honest, I've never looked that closely at the eager loading code, so I may just be spreading my own ignorance (and fear) here :).
That said, arbitrary depth eager loading gets my vote as a feature to roll into dm-core. /cc @dkubb, @solnic—any thoughts about prospects for rolling a polished, tested implementation of this into dm-core?
In my head it wouldn't be too difficult to do this transparently. You'd create a thread-global stack, since you can only be iterating one collection at a time, right? So for each nested loop, the stack grows by 1 and as the loop ends and, either, another one begins, or the outer one ends, the stack continues to either grow or shrink. The stack just contains what I'm storing in the scope variable, so you know what model the parent is at any given point. Seems like it should work with any nesting level.
I'm clearly over-simplifying a complex situation, but, ya know, "in my head" it work like this :P
Here's a version that has been split into a class and a wrapper module, supports composite keys (I think), uses real collections exactly the same as if they were not eager-loaded and has been refactored into small methods (yet seems to feel more complex? o_O).
module DataMapper
class Collection
# EagerLoader takes a QueryPath object and loads all relationships referenced in the path, into an existing Collection.
#
# Using eager-loading allows you to optimize out the classic "n+1" query problem, when you intend to iterate over several
# arbitrarily deeply nested relationships. One query per relationship is executed, vs one query per record.
class EagerLoader
# Initialize the EagerLoader to pre-load all relationships as deep as +query_path+ into +collection+.
#
# @param [Collection] collection
# the source collection to load related resources into
#
# @param [QueryPath] query_path
# a valid QueryPath for the target model in the collection
def initialize(collection, query_path)
@collection = collection
@query_path = query_path
end
# Perform eager loading immediately and return the collection.
#
# The number of queries executed is identical to the number of relationships in the query path.
# This method should not be invoked before query-building is complete (e.g. before limits have been applied)
#
# @return [Collection]
# the source collection resources were loaded into
def load
scope = @collection
@query_path.relationships.each do |relation|
next_scope = relation.target_model.all(target_conditions(scope, relation))
load_into_collection(scope, next_scope, relation)
scope = next_scope
end
@collection
end
private
# Map target key names to source key values, to create valid query conditions for finding all related resources
def target_conditions(collection, relationship)
collection.inject({}) do |conditions, resource|
# FIXME: Reduce the complexity here
conditions.merge!(Hash[relationship.target_key.collect(&:name).zip(relationship.source_key.get(resource))]) do |key_name, v1, v2|
Array(v1) + Array(v2)
end
end
end
# Given a set of resources and the relationship from which they derive, map them, referenced by the target key
def key_mappings(collection, relationship)
collection.inject({}) do |map, resource|
map.merge(relationship.target_key.get(resource) => [resource]) do |key, resources1, resources2|
resources1 + resources2
end
end
end
# For each of the pre-loaded resources in +related_resources+, re-establish their relationships in +collection+
def load_into_collection(collection, related_resources, relationship)
map = key_mappings(related_resources, relationship)
collection.each do |resource|
if map.key?(relationship.source_key.get(resource))
resource.instance_variable_set(
:"@#{relationship.name}",
relationship.collection_for(resource).set(map[relationship.source_key.get(resource)])
)
end
end
end
end
module EagerLoading
def eager_load(query_path)
EagerLoader.new(self, query_path).load
end
end
include EagerLoading
end
end
It occurs to me that this is very focused on 1:n relationships. It will need changes to work for other types of relationship. I won't have time to look at it further tonight as there are some other things I want to get started, but hopefully tomorrow I can pick this back up and make sure it works with other types of relationships, plus get some spec coverage on it.
I agree, this does look a little more complicated than the last iteration, but I'll chalk that up to the collections and composite key support.
Did it not work to use a Relationship as a query predicate? (eg., instead of Hash[relationship.target_key.collect(&:name).zip(relationship.source_key.get(resource))]
, do relationship => resource
). There's definitely code in place to support that API (DataMapper::Query::Conditions::Comparison::RelationshipHandler#foreign_key_mapping
), but I'm not sure if it will work here.
One teeny thing: I think you can use Relationship#set
instead of directly setting the instance variable on resource in #load_into_collection
. Currently, it's:
resource.instance_variable_set(
:"@#{relationship.name}",
relationship.collection_for(resource).set(map[relationship.source_key.get(resource)])
)
But I think it can be (I also introduced a couple of explanatory temporary variables):
collection = relationship.collection_for(resource)
eager_loaded_resources = map[relationship.source_key.get(resource)]
collection.set(eager_loaded_resources)
relationship.set(resource, collection)
Re: relationship as a predicate, I tried something (I forget what) and while it didn't error, it didn't produce the correct results. I almost certainly did the wrong thing though. Are you able to check out the original gist, drop in my code then see if you get the correct output with your suggested simplification?
You should see the following under the "test case C" heading:
[INFO] test case C ===== /!\ n+1 hazard ====
~ (0.000043) SELECT "id", "name" FROM "people" ORDER BY "id"
~ (0.000055) SELECT "id", "name", "person_id" FROM "vehicles" WHERE "person_id" IN (1, 2) ORDER BY "id"
~ (0.000086) SELECT "id", "name", "manufacturer_id", "vehicle_id" FROM "components" WHERE "vehicle_id" IN (1, 2, 3) ORDER BY "id"
FND Taurus engine
FND Taurus radio
FND fixie frame
FND fixie breaks
tillsc Golf engine
I always get a bit lost poking around in dm-core, but I always learn something new. It would be good to clear that "FIXME" :)
Awesome advice in Relationship#set
! Will try that. Classic example of getting a bit lost :P (Excuse the fragmented response... I'm a bit distracted with food and another coding problem simultaneously ;))
Hmm, I was just looking through some of Relationship
, and I noticed #eager_load
, which looks like exactly the API for this situation. So if I'm reading it correctly, you could simplify the whole thing to:
def load(query_path)
query_path.relationships.inject(@collection) do |scope, relationship|
relationship.eager_load(scope)
end
end
(BTW, I've vacillated, and I now prefer passing the Query::Path
to #load
instead of #initialize
(reverting to how you had it before). It seems more coherent, even though EagerLoader is probably still a single-use object.)
It's too late here for me to start hacking on this right now, but I'm going to try to make time to poke at it tomorrow.
@d11wtq—looks like Relationship#eager_load
does work; check out my fork of this gist: https://gist.github.com/1297105
Hey guys, I know this is an old thread, but check this out: https://gist.github.com/3100034
First thing that came to mind (not finished, but effectively an algorithm that works):