Skip to content

Instantly share code, notes, and snippets.

@guss77
Last active June 28, 2024 04:32
Show Gist options
  • Save guss77/f8e610cfddbe02c07896 to your computer and use it in GitHub Desktop.
Save guss77/f8e610cfddbe02c07896 to your computer and use it in GitHub Desktop.
Try to recover deleted documents from a mongodb 2.4 data file. Based on https://gist.github.com/egguy/2788955 with help from https://yazadk.wordpress.com/2015/07/15/a-forensic-perspective-on-recovering-deleted-data-from-big-data-systems/#MongoDB . Make sure to change `decode_chunk` so it properly detects the objects you are trying to recover - i…
#!/usr/bin/python
"""A little script to recover deleted recording of a mongoDB db file
There's no optimization but it work and has saved me
"""
import struct
import bson
import pymongo
import sys
import mmap
def decode_chunck(chunck):
"Try to decode a chunck"
#if not bson.is_valid(chunck):
# return None
try:
result = bson.decode_all(chunck)[0]
if not result:
return None
else:
# if there's all the searched field, return it
if 'uploader' in result or 'user_id' in result: # and 'field_2' in result and 'field_3' in result:
return result
except Exception:
#print "exception"
return None
#print "no expected data"
return None
def generate_chunck(data, pos=0):
"Generator to create chunck"
print "open at: %s" % pos
f= open(data,'rb')
#a=f.read()
a = mmap.mmap(f.fileno(),0, prot=mmap.PROT_READ)
#size = len(a)
size = a.size()
max_size = 131072
while pos < size-4:
# Progress indicator
if pos % 1024 ==0:
print pos
# Determine the size of the possible bson encoded data
bson_size = struct.unpack("<I", a[pos:pos + 4])[0]
# if its deleted, guess as to its size
if bson_size == 4008636142:
print "possible deleted chunk"
scanhead = pos+8 # skip over header
# find next item
while scanhead < size-4 and a[scanhead:scanhead + 4] != '\x07_id':
scanhead += 1
print "Scanned ahead " + str(scanhead)
# figure out correct size between here and next item
ret = None
bson_size = 10
while (ret == None) and pos + bson_size < scanhead and bson_size < max_size:
bson_size += 1
while a[pos+bson_size] != '\x00':
bson_size += 1
#print "try decode with size " + str(bson_size)
ret = decode_chunck(struct.pack("<I",bson_size) + a[pos+4:pos+bson_size])
if ret:
print "Possible value with size " + str(bson_size) + ": " + repr(ret)
yield struct.pack("<I",bson_size) + a[pos+4:pos+bson_size]
pos += bson_size
else:
print "no chunk found"
pos += 1
continue
# If it's more than 2KB reject it (perfect for me)
if bson_size > 2*1024:
# Continue tu search in the file
pos += 1
continue
# If the bson is bigger than the file, reject it
if bson_size+pos > size-1:
pos += 1
continue
# A bson should end by \x00
# http://bsonspec.org/#/specification
if a[pos+bson_size] != '\x00':
pos += 1
continue
# Chunck it
chunck = a[pos:pos+bson_size]
pos += 1
yield chunck
# create connection
connection = pymongo.MongoClient('localhost', 27017)
# Connect to MongoDB in order to reinsert the data
db = connection.recover_db
collection = db.recover_collection
# argv[1] = the file to recover
# argv[2] = Where to start in the file
for chunck in generate_chunck(sys.argv[1], int(sys.argv[2])):
result = decode_chunck(chunck)
if result:
try:
print "insert"
collection.insert(result)
except pymongo.errors.DuplicateKeyError:
None
@paulokopny
Copy link

Just saved me

@ramsessal
Copy link

Any update for the latest versions?

@guss77
Copy link
Author

guss77 commented Jul 6, 2022

Apologies, but I'm not maintaining this code - it is just a gist.

If you managed to wrestle this code into working with later versions, feel free to drop a comment and I'll see about merging your changes.

@AlphaNumeric99
Copy link

Can you explain where I can get the following? Thanks

# argv[1] = the file to recover
# argv[2] = Where to start in the file 

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment