Skip to content

Instantly share code, notes, and snippets.

@hughdbrown
Last active March 2, 2018 14:57
Show Gist options
  • Save hughdbrown/1f2503db2b7929a816590a737e665409 to your computer and use it in GitHub Desktop.
Save hughdbrown/1f2503db2b7929a816590a737e665409 to your computer and use it in GitHub Desktop.
Code to checkpoint and diff MongoDB databases
#!/usr/bin/env python
"""
Python module to create and diff checkpoints on mongo databases.
"""
import logging
# import json
from uuid import uuid4
from pprint import pprint
from datetime import datetime # pylint: disable=unused-import
from pymongo import MongoClient
from bson import ObjectId # pylint: disable=unused-import
# Configure the root logger when this module is imported so that INFO-level
# (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by the classes below.
LOGGER = logging.getLogger(__name__)
class DictDiffer(object):
    """
    Calculate the difference between two dictionaries as:
    (1) items added
    (2) items removed
    (3) keys same in both but changed values
    (4) keys same in both and unchanged values
    """

    def __init__(self, current_dict, past_dict):
        """
        Remember both dictionaries and precompute their key sets.

        :param current_dict: the newer mapping
        :param past_dict: the older mapping it is compared against
        """
        self.current_dict, self.past_dict = current_dict, past_dict
        # ``.keys()`` is called on purpose: a non-mapping argument (e.g. a
        # string) fails right here instead of being silently treated as a
        # collection of keys.
        self.current_keys = set(current_dict.keys())
        self.past_keys = set(past_dict.keys())
        self.intersect = self.current_keys.intersection(self.past_keys)

    def added(self):
        """Return the set of keys present only in the current dictionary."""
        return self.current_keys - self.intersect

    def removed(self):
        """Return the set of keys present only in the past dictionary."""
        return self.past_keys - self.intersect

    def changed(self):
        """Return the set of shared keys whose values compare unequal."""
        return set(o for o in self.intersect
                   if self.past_dict[o] != self.current_dict[o])

    def unchanged(self):
        """Return the set of shared keys whose values compare equal."""
        return set(o for o in self.intersect
                   if self.past_dict[o] == self.current_dict[o])
class MongoCheckpoint(object):
    """
    Class to encapsulate snapshotting and later diffing
    the state of a mongodb database
    """

    # Maximum number of ids sent in a single ``$in`` query, so that a large
    # diff is fetched with several moderately sized queries rather than one
    # huge query document.
    _BATCH_SIZE = 1000

    def __init__(self, dbname, host="localhost"):
        """
        Open a client and select the database to work on.

        :param dbname: name of the database to snapshot and diff
        :param host: value handed to ``MongoClient`` (defaults to localhost)
        """
        self.dbname = dbname
        self.host = host
        LOGGER.info("host: %s", self.host)
        self.client = MongoClient(self.host)
        LOGGER.info("dbname: %s", self.dbname)
        self.database = self.client[dbname]

    def checkpoint(self):
        """
        Create a checkpoint of the database.

        Sort of a mongoexport, except that only the ``_id`` of every
        document is recorded.

        :return: dict mapping each non-system collection name to the list of
            ``_id`` values found in that collection
        """
        # NOTE(review): ``collection_names`` is believed to be deprecated in
        # newer pymongo releases in favour of ``list_collection_names`` --
        # confirm against the installed pymongo version.
        return {
            coll_name: [
                ident['_id']
                for ident in self.database[coll_name].find({}, {'_id': 1})
            ]
            for coll_name in self.database.collection_names(
                include_system_collections=False)
        }

    def _fetch_documents(self, coll_name, ids):
        """
        Return every document of ``coll_name`` whose ``_id`` is in ``ids``.

        Each id is converted with ``ObjectId(...)`` before querying, so a
        value that cannot be converted raises.
        """
        object_ids = [ObjectId(value) for value in ids]
        collection = self.database[coll_name]
        documents = []
        for start in range(0, len(object_ids), self._BATCH_SIZE):
            batch = object_ids[start:start + self._BATCH_SIZE]
            documents.extend(collection.find({"_id": {"$in": batch}}))
        return documents

    def _collect(self, ids_by_collection):
        """
        Fetch the documents for every collection that has at least one id.

        Collections with no ids get no entry at all, so the 'project'
        fallback in ``_diff`` only sees collections that produced results.
        """
        return {
            coll_name: self._fetch_documents(coll_name, ids)
            for coll_name, ids in ids_by_collection.items()
            if ids
        }

    def _diff(self, new_obj, old_obj):
        """
        Helper to generate the differences between two checkpoints.

        :param new_obj: newer checkpoint (collection name -> list of ids)
        :param old_obj: older checkpoint (collection name -> list of ids)
        :return: dict mapping collection name to the list of documents that
            are new in ``new_obj``, or None when fetching them failed (the
            failure is logged)
        """
        dd = DictDiffer(new_obj, old_obj)
        new_tables = dd.added()
        changed_tables = dd.changed()
        try:
            # These are entire tables that were added: every id is new.
            added_dict = self._collect({
                coll_name: list(new_obj[coll_name])
                for coll_name in new_tables
            })
            LOGGER.info("added_dict: %s", added_dict)
        except Exception:  # pylint: disable=broad-except
            LOGGER.exception("added_dict")
            return None
        try:
            # These are tables that have new values: keep only the ids that
            # were absent from the older checkpoint.
            same_dict = self._collect({
                coll_name: list(
                    set(new_obj[coll_name]) - set(old_obj[coll_name]))
                for coll_name in changed_tables
            })
            LOGGER.info("samedict: %s", same_dict)
        except Exception:  # pylint: disable=broad-except
            LOGGER.exception("same_dict")
            return None
        # This is a hack: for some reason, db.project records are not
        # getting pulled in
        if 'project' not in added_dict and 'project' not in same_dict:
            same_dict['project'] = list(
                self.database['project'].find({"partition": {"$exists": 1}}))
        # Combine the two dictionaries into one
        difference = dict(added_dict)
        difference.update(same_dict)
        LOGGER.info("difference: %s", difference)
        return difference

    def diff_checkpoints(self, data1, data2):
        """
        Create the difference of two checkpoints and save it to a file.

        The documents that are present in ``data2`` but not in ``data1`` are
        pretty-printed into a new ``<uuid4>.data`` file in the current
        directory; the file name is logged.  When the difference cannot be
        computed, an error is logged and no file is written.

        :param data1: older checkpoint (collection name -> list of ids)
        :param data2: newer checkpoint (collection name -> list of ids)
        """
        difference = self._diff(data2, data1)
        if difference is None:
            # The cause has already been logged with its traceback; do not
            # leave behind a file that only contains "None".
            LOGGER.error("Difference could not be computed; nothing saved")
            return
        filename = "{}.data".format(uuid4())
        with open(filename, "w") as handle:
            pprint(difference, stream=handle, indent=4)
        LOGGER.info("Difference saved to: %s", filename)
if __name__ == '__main__':
    # Imported here because only this script entry point needs it.
    import json

    def _load_checkpoint(path):
        """Read one saved checkpoint (collection name -> list of ids)."""
        with open(path) as handle:
            return json.load(handle)

    mcp = MongoCheckpoint(dbname="some database")
    # NOTE(review): assumes both files hold checkpoints serialised as JSON
    # objects mapping collection names to lists of id strings -- confirm
    # against whatever wrote them.
    FILE1 = "38568e79-e23c-4481-81aa-2fd8af5ff3b9.json"
    FILE2 = "a472dea0-d4c4-4469-90c7-3dc2045363db.json"
    # diff_checkpoints expects the checkpoint dictionaries themselves, not
    # the names of the files that contain them.
    mcp.diff_checkpoints(_load_checkpoint(FILE1), _load_checkpoint(FILE2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment