Last active
March 2, 2018 14:57
-
-
Save hughdbrown/1f2503db2b7929a816590a737e665409 to your computer and use it in GitHub Desktop.
Code to checkpoint and diff MongoDB databases
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Python module to create and diff checkpoints on mongo databases. | |
""" | |
import logging | |
# import json | |
from uuid import uuid4 | |
from pprint import pprint | |
from datetime import datetime # pylint: disable=unused-import | |
from pymongo import MongoClient | |
from bson import ObjectId # pylint: disable=unused-import | |
# Configure the root logger at import time so that INFO-level messages
# (and above) are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module; MongoCheckpoint logs through it.
LOGGER = logging.getLogger(__name__)
class DictDiffer(object):
    """
    Compare a "current" dictionary against a "past" one, key by key.

    The comparison is exposed as four sets of keys:
    (1) added     -- keys only present in the current dictionary
    (2) removed   -- keys only present in the past dictionary
    (3) changed   -- keys present in both whose values differ
    (4) unchanged -- keys present in both whose values are equal
    """

    def __init__(self, current_dict, past_dict):
        self.current_dict = current_dict
        self.past_dict = past_dict
        self.current_keys = set(current_dict.keys())
        self.past_keys = set(past_dict.keys())
        # Keys shared by both dictionaries; every method below builds on this.
        self.intersect = self.current_keys & self.past_keys

    def added(self):
        """Return the keys that exist only in the current dictionary."""
        return self.current_keys.difference(self.intersect)

    def removed(self):
        """Return the keys that exist only in the past dictionary."""
        return self.past_keys.difference(self.intersect)

    def changed(self):
        """Return the shared keys whose values differ between the two."""
        return {
            key for key in self.intersect
            if self.past_dict[key] != self.current_dict[key]
        }

    def unchanged(self):
        """Return the shared keys whose values are equal in both."""
        return {
            key for key in self.intersect
            if self.past_dict[key] == self.current_dict[key]
        }
class MongoCheckpoint(object):
    """
    Class to encapsulate snapshotting and later diffing
    the state of a mongodb database.

    A checkpoint is a dict mapping each collection name to the list of
    ``_id`` values that were present in that collection.
    """

    def __init__(self, dbname, host="localhost"):
        """
        Connect to ``host`` and select the database ``dbname``.

        :param dbname: name of the database to checkpoint and diff
        :param host: value handed to ``MongoClient`` (defaults to localhost)
        """
        self.dbname = dbname
        self.host = host
        LOGGER.info("host: %s", self.host)
        self.client = MongoClient(self.host)
        LOGGER.info("dbname: %s", self.dbname)
        self.database = self.client[dbname]

    def checkpoint(self):
        """
        Create a checkpoint of the database (sort of a mongoexport).

        :returns: dict mapping every non-system collection name to the list
            of ``_id`` values of the documents it currently holds
        """
        # NOTE(review): Database.collection_names() is deprecated since
        # PyMongo 3.7 and removed in 4.0 (replaced by list_collection_names())
        # -- confirm the PyMongo version this script is meant to run against.
        return {
            coll_name: [ident['_id'] for ident in self.database[coll_name].find({}, {'_id': 1})]
            for coll_name in self.database.collection_names(include_system_collections=False)
        }

    @staticmethod
    def _fetch_documents(database, ids_by_collection):
        """
        Fetch every document named by ``ids_by_collection``.

        :param database: object indexable by collection name
        :param ids_by_collection: dict of collection name -> iterable of ids;
            each id is passed through ``ObjectId`` before querying
        :returns: dict of collection name -> list of matching documents;
            collections with no ids to fetch are left out entirely
        """
        documents = {}
        for coll_name, values in ids_by_collection.items():
            object_ids = [ObjectId(value) for value in values]
            if not object_ids:
                # Leave the key absent so callers can still test
                # "was anything fetched for this collection?" with `in`.
                continue
            # One $in query per collection returns *all* requested documents.
            # Building the dict with one entry per id would overwrite the key
            # each time and keep only the last document.
            documents[coll_name] = list(
                database[coll_name].find({"_id": {"$in": object_ids}})
            )
        return documents

    def _diff(self, new_obj, old_obj):
        """
        Helper to generate the differences between two checkpoints.

        :param new_obj: the newer checkpoint
        :param old_obj: the older checkpoint
        :returns: dict of collection name -> documents that are new in
            ``new_obj``, or ``None`` if fetching them failed (the failure
            is logged)
        """
        database = self.database
        dd = DictDiffer(new_obj, old_obj)
        new_tables = dd.added()
        changed_tables = dd.changed()

        try:
            # These are entire tables that were added
            added_dict = self._fetch_documents(
                database,
                {coll_name: new_obj[coll_name] for coll_name in new_tables},
            )
            LOGGER.info("added_dict: %s", added_dict)
        except Exception:  # pylint: disable=broad-except
            # Best effort: log the failure and report "no difference available".
            LOGGER.exception("added_dict")
            return None

        try:
            # These are tables that have new values
            same_dict = self._fetch_documents(
                database,
                {
                    coll_name: set(new_obj[coll_name]) - set(old_obj[coll_name])
                    for coll_name in changed_tables
                },
            )
            LOGGER.info("samedict: %s", same_dict)
        except Exception:  # pylint: disable=broad-except
            LOGGER.exception("same_dict")
            return None

        # This is a hack: for some reason, db.project records are not getting pulled in
        if 'project' not in added_dict and 'project' not in same_dict:
            same_dict['project'] = list(database['project'].find({"partition": {"$exists": 1}}))

        # Combine the two dictionaries into one
        difference = dict(added_dict)
        difference.update(same_dict)
        LOGGER.info("difference: %s", difference)
        return difference

    def diff_checkpoints(self, data1, data2):
        """
        Create the difference of two checkpoints and save it to a file.

        Only additions are reported: documents of collections that exist
        only in ``data2``, and documents whose ids appear in ``data2`` but
        not in ``data1`` for collections present in both. The result is
        pretty-printed to a freshly named ``<uuid4>.data`` file in the
        current working directory. If fetching the documents fails, the
        error is logged and the file contains ``None``.

        :param data1: the older checkpoint (collection name -> list of ids)
        :param data2: the newer checkpoint (collection name -> list of ids)
        :returns: name of the file the difference was written to
        """
        difference = self._diff(data2, data1)
        # filename = "{}.json".format(uuid4())
        filename = "{}.data".format(uuid4())
        with open(filename, "w") as handle:
            # handle.write(json.dumps(difference, sort_keys=True, indent=4))
            pprint(difference, stream=handle, indent=4)
        LOGGER.info("Difference saved to: %s", filename)
        return filename
if __name__ == '__main__':
    # Local import: only the script entry point needs json.
    import json

    mcp = MongoCheckpoint(dbname="some database")
    # checkpoint("a database")
    FILE1 = "38568e79-e23c-4481-81aa-2fd8af5ff3b9.json"
    FILE2 = "a472dea0-d4c4-4469-90c7-3dc2045363db.json"
    # diff_checkpoints() expects checkpoint mappings
    # ({collection name: [ids]}), not file names: passing the names
    # themselves fails as soon as the checkpoints are compared as dicts.
    # NOTE(review): assumes each file holds a JSON-serialized checkpoint
    # whose ids are stored as strings -- confirm against whatever wrote them.
    with open(FILE1) as handle:
        CHECKPOINT1 = json.load(handle)
    with open(FILE2) as handle:
        CHECKPOINT2 = json.load(handle)
    mcp.diff_checkpoints(CHECKPOINT1, CHECKPOINT2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment