Created
May 25, 2012 16:04
-
-
Save egguy/2788955 to your computer and use it in GitHub Desktop.
Recover deleted data from a MongoDB database
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""A small script to recover deleted records from a MongoDB database file.

It is not optimized, but it works and has saved me.
"""
import struct | |
import bson | |
import pymongo | |
import sys | |
def decode_chunck(chunck, required_fields=('field_1', 'field_2', 'field_3')):
    """Try to decode a candidate chunk as a BSON document.

    Args:
        chunck: Raw bytes that may or may not hold one BSON document.
        required_fields: Names of the keys a decoded document must contain
            to be accepted. Defaults to the placeholder names used by the
            original script; pass the field names of your own collection.

    Returns:
        The first decoded document when it is non-empty and contains every
        name in ``required_fields``; otherwise ``None``.
    """
    try:
        documents = bson.decode_all(chunck)
    except Exception:
        # Deliberately broad: the caller feeds arbitrary byte ranges from a
        # raw data file, so almost every candidate is garbage and any
        # decoding failure simply means "not a document".
        return None

    # Nothing decoded at all, or the first document is empty.
    if not documents or not documents[0]:
        return None

    result = documents[0]
    # Only keep documents that look like the records being recovered.
    if all(field in result for field in required_fields):
        return result
    return None
def generate_chunck(data, pos=0, max_size=2 * 1024):
    """Scan a file byte by byte and yield byte ranges that may be BSON.

    At every offset the next four bytes are read as a little-endian unsigned
    length. The range is yielded when the length is plausible, the range
    fits inside the file, and its last byte is the NUL terminator that the
    BSON specification requires at the end of a document.

    Args:
        data: Path of the file to scan.
        pos: Byte offset at which to start scanning.
        max_size: Largest document length (in bytes) to accept. Defaults to
            2 KB, the limit used by the original script.

    Yields:
        Candidate chunks (bytes) of exactly the announced length. They are
        only candidates: the caller still has to try decoding them.
    """
    print("open at: %s" % pos)
    # The whole file is read up front; the handle is closed before scanning.
    with open(data, 'rb') as f:
        a = f.read()
    size = len(a)

    # Smallest possible BSON document: 4-byte length + terminating NUL.
    min_size = 5

    # Stop once fewer than four bytes remain: no length prefix fits there.
    while pos + 4 <= size:
        # Progress indicator
        if pos % 1024 == 0:
            print(pos)

        # Determine the size of the possible BSON encoded data
        bson_size = struct.unpack_from("<I", a, pos)[0]
        end = pos + bson_size

        # The announced length includes the trailing NUL, so the terminator
        # is the last byte of the range. A one-byte slice is compared so the
        # check behaves the same for Python 2 str and Python 3 bytes.
        if (min_size <= bson_size <= max_size
                and end <= size
                and a[end - 1:end] == b'\x00'):
            yield a[pos:end]

        # Documents are not aligned, so every offset is tried in turn.
        pos += 1
def main(argv=None):
    """Scan a MongoDB data file and re-insert every recovered document.

    Args:
        argv: Argument list; defaults to ``sys.argv``.
            ``argv[1]`` is the path of the file to recover from.
            ``argv[2]`` (optional) is the byte offset at which to start
            scanning; it defaults to 0.

    Returns:
        0 on completion, 2 when the file argument is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        program = argv[0] if argv else 'recover.py'
        sys.stderr.write(
            "usage: %s <data-file> [start-offset]\n" % program)
        return 2

    path = argv[1]
    start = int(argv[2]) if len(argv) > 2 else 0

    # Create the connection. Newer PyMongo releases only ship MongoClient;
    # fall back to the legacy Connection class when it is the only option.
    client_factory = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
    connection = client_factory('localhost', 27017)

    # Connect to MongoDB in order to reinsert the data
    db = connection.recover_db
    collection = db.recover_collection

    # Look the method up on the class: attribute access on a Collection
    # instance resolves unknown names to sub-collections, so a plain
    # getattr() on the instance cannot detect a missing method.
    if hasattr(type(collection), 'insert_one'):
        insert = collection.insert_one
    else:
        insert = collection.insert

    for chunck in generate_chunck(path, start):
        result = decode_chunck(chunck)
        if result:
            print("insert")
            insert(result)
    return 0


if __name__ == "__main__":
    sys.exit(main())
This didn't work for me on Mongo 2.4; apparently the file format has changed, and Mongo now overwrites the object size with \xee\xee\xee\xee. I have a gist at https://gist.github.com/guss77/f8e610cfddbe02c07896 that modifies the above code to try to guess the recoverable size of an object.
It shows: ./recover.py: line 8: syntax error near unexpected token `('
./recover.py: line 8: `def decode_chunck(chunck):'
@egguy can you please give an example for argv[1] and argv[2]? I don't know which file we have to feed it.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Could you please show a sample of argv[1] and argv[2]?
And where can the file for each argument be found?
Thank you in advance.