Created
February 8, 2012 18:23
-
-
Save Bpless/1771930 to your computer and use it in GitHub Desktop.
MongoEngine Compression Code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime

from mongoengine.base import TopLevelDocumentMetaclass
class CompressedKeyDocumentMetaclass(TopLevelDocumentMetaclass):
    """Metaclass that compresses MongoDB field names ("key compression").

    When a Document declares ``meta = {'compress_keys': True}``, every
    field's ``db_field`` is rewritten to the shortest prefix of its name
    that is still unique within the collection (``'test'`` -> ``'t'``,
    ``'trial'`` -> ``'tr'``), and the full <-> compressed mapping is
    persisted in a side collection named ``'<collection>_mapping'`` so
    existing documents remain readable and retired fields never have
    their compressed keys reused.
    """

    def __new__(cls, name, bases, attrs):
        """Build the Document class, then compress its field names.

        MongoEngine's ``TopLevelDocumentMetaclass`` populates ``_fields``
        and ``_meta``; we let it run first and post-process the result.
        That is not the most efficient flow.  Going forward, we should
        either fork MongoEngine and insert this logic directly into
        ``TopLevelDocumentMetaclass``, or process ``attrs`` before
        instantiating the new class.
        """
        new_class = super(CompressedKeyDocumentMetaclass, cls).__new__(cls, name, bases, attrs)
        compress = 'meta' in attrs and attrs['meta'].get('compress_keys', False)
        if compress and hasattr(new_class, '_fields'):
            key_mapping = new_class._map_fields()
            # Rewrite any declared indexes in cls._meta['indexes'] so they
            # reference the compressed field names.
            if new_class._meta.get('indexes'):
                for index in new_class._meta.get('indexes'):
                    index['fields'] = [(key_mapping[f[0]], f[1])
                                       for f in index['fields']]
        return new_class

    @property
    def _mapping_collection(cls):
        """Return the side collection that stores this class's key mapping.

        The collection is named by appending '_mapping' to the MongoEngine
        collection name and is created implicitly on first lookup.
        """
        collection_name = '%s_mapping' % cls._get_collection_name()
        # NOTE(review): ``connection`` is never imported in this module, so
        # this raises NameError as written.  It presumably should be the
        # active pymongo database handle -- confirm and import it.
        return getattr(connection, collection_name)

    def _is_embedded_field(cls, field):
        """Return True when *field* wraps an embedded document, i.e. it
        carries a non-None ``.field`` attribute."""
        return getattr(field, 'field', None) is not None

    def _field_name_set(cls, subfield=None):
        """Return the set of field names at a single nesting level.

        Without *subfield* this is the class's own top-level field names;
        with an embedded *subfield* it is the names nested inside it.
        """
        if not subfield:
            fields = cls._fields.values()
        else:
            fields = subfield.field.document_type._fields.values()
        return set(f.name for f in fields)

    def _compress_name(cls, db_field, used_names):
        """Return the shortest unique prefix of *db_field* absent from
        *used_names*, e.g. handling collisions such as::

            'rechandler' --> 'r'
            'recluse'    --> 're'
            'recsize'    --> 'rec'
            'rec'        --> 'rec_1'

        When every prefix (including the full name) is taken, fall back to
        deterministic '<name>_<n>' suffixes.  (Fixes the original loop,
        which could silently fail to record a fully-colliding name.)
        """
        for i in xrange(len(db_field)):
            candidate = db_field[:i + 1]
            if candidate not in used_names:
                return candidate
        suffix = 1
        while '%s_%d' % (db_field, suffix) in used_names:
            suffix += 1
        return '%s_%d' % (db_field, suffix)

    def _set_fields(cls, fields, collection=None, document=None):
        """Record compressed names for *fields* in the mapping collection.

        Handles all fields the first time a class is evaluated and only
        the changed fields afterwards.  For each field it records the
        uncompressed name, the compressed name ('db_key'), and the
        datetime at which the field was added.

        Compressed names are the minimum unique sequential prefix of the
        full string ('test' --> 't', 'trial' --> 'tr'); see
        ``_compress_name`` for collision handling.  The advantage of this
        route is that compressed names clearly relate to the uncompressed
        ones.

        Embedded fields are handled recursively.  It may be possible to
        compress directly on the EmbeddedObject class, but that was not
        working; revisit that possibility.
        TODO: handle setting of embedded fields whose parent field has not
        changed.
        """
        old_fields = dict(document.items()) if document else {}
        used_names = set(f.get('db_key') for f in old_fields.values())
        new_fields_dict = {}
        for f in fields:
            if f.db_field in ('_id', '_cls', '_types'):
                # Reserved Mongo/MongoEngine keys are stored verbatim.
                new_fields_dict[f.db_field] = {'db_key': f.db_field,
                                               'set': datetime.datetime.now()}
            else:
                packed_name = cls._compress_name(f.db_field, used_names)
                new_fields_dict[f.name] = {'db_key': packed_name,
                                           'set': datetime.datetime.now()}
                used_names.add(packed_name)
                f.db_field = packed_name
            # Handle embedded documents recursively.
            if cls._is_embedded_field(f):
                embedded_fields = cls._set_fields(
                    f.field.document_type._fields.values(), document=document)
                embed_dict = {}
                for embed in embedded_fields:
                    embed_dict[embed.name] = {'db_key': embed.db_field,
                                              'set': datetime.datetime.now()}
                new_fields_dict[f.name].update({'embedded_fields': embed_dict})
        if collection:
            if document:
                # Anchor the $set on the first previously stored field so
                # the correct mapping document is updated.
                anchor_name, anchor_value = list(old_fields.items())[0]
                query = {'%s.db_key' % anchor_name: anchor_value.get('db_key')}
                collection.update(query, {'$set': new_fields_dict})
            else:
                collection.save(new_fields_dict)
            return new_fields_dict
        # Recursive (embedded) calls pass no collection and expect the
        # mutated field objects back.
        return fields

    def _unset_fields(cls, collection, field_key, field_value, document,
                      embedded_key=None, embedded_key_packed=None):
        """Mark a removed field as inactive in the mapping collection.

        Rather than deleting the entry -- which would allow a future
        compressed name to conflict with keys already present in stored
        documents -- the entry is stamped with an 'unset' datetime.
        (Refactor naming here to avoid confusion with Mongo's $unset
        modifier.)  Works for both top-level and embedded fields.
        """
        if field_key in ('_id', '_cls', '_types'):
            return
        if embedded_key is None:
            query = {'%s.db_key' % field_key: field_value}
            change = {'%s.unset' % field_key: datetime.datetime.now()}
        else:
            path = '%s.embedded_fields.%s' % (field_key, embedded_key)
            query = {'%s.db_key' % path: embedded_key_packed}
            change = {'%s.unset' % path: datetime.datetime.now()}
        collection.update(query, {'$set': change})

    def _pack_field(cls, field, dict_key, dict_value):
        """If *dict_key* names *field*, apply the stored compressed name to
        ``field.db_field`` and return it; otherwise return None."""
        if dict_key == field.name:
            field.db_field = dict_value.get('db_key')
            return field.db_field

    def _map_fields(cls):
        """Synchronise the class's fields with the mapping collection.

        Creates the mapping document on first use, registers fields added
        since the last run, retires fields (top-level and embedded) that
        no longer exist on the class, and rewrites each field's
        ``db_field`` to its compressed form.

        Returns a dict mapping active field names to compressed names.
        """
        collection = cls._mapping_collection
        meta_keys_doc = collection.find_one()
        cls_fields = cls._fields.values()
        cls_field_set = cls._field_name_set()
        if not meta_keys_doc:
            meta_keys_doc = cls._set_fields(cls_fields, collection=collection)
        else:
            new_fields = [f for f in cls_fields
                          if f.name not in meta_keys_doc.keys()
                          and f.name is not None]
            if new_fields:
                fields_dict = cls._set_fields(new_fields,
                                              collection=collection,
                                              document=meta_keys_doc)
                meta_keys_doc.update(fields_dict)
        key_mapping = {}
        for field_key, field_value in meta_keys_doc.items():
            # Unset inactive top-level fields.
            if field_key not in cls_field_set and not field_value.get('unset'):
                cls._unset_fields(collection, field_key,
                                  field_value['db_key'], meta_keys_doc)
            else:
                for cf in cls_fields:
                    # Unset inactive embedded fields.
                    if cls._is_embedded_field(cf):
                        embed_field_set = cls._field_name_set(cf)
                        for k, v in meta_keys_doc[cf.name]['embedded_fields'].items():
                            if not v.get('unset') and k not in embed_field_set:
                                cls._unset_fields(collection, cf.name,
                                                  cf.db_field, meta_keys_doc,
                                                  embedded_key=k,
                                                  embedded_key_packed=v.get('db_key'))
                    if field_key == cf.name:
                        # Map all active field names within the class obj to
                        # compacted names.  Happens every time, as opposed to
                        # the _set_fields method.
                        key_mapping[field_key] = cls._pack_field(cf, field_key, field_value)
                        if cls._is_embedded_field(cf):
                            for f in cf.field.document_type._fields.values():
                                sub_key = field_value.get('embedded_fields').get(f.name)
                                if sub_key:
                                    f.db_field = sub_key.get('db_key')
        return key_mapping
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment