Skip to content

Instantly share code, notes, and snippets.

@kireal
Last active February 12, 2018 18:43
Show Gist options
  • Save kireal/77b54002b17bd7285fb21b99b68e4981 to your computer and use it in GitHub Desktop.
Save kireal/77b54002b17bd7285fb21b99b68e4981 to your computer and use it in GitHub Desktop.
Run to deduplicate mongodb collection
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Utility for deduplicating a MongoDB collection.
#
# Works only with pymongo >= 3.0
# Kireal
#
# 01.10.2016
# Last change: 04.10.2016
# TODO:
#
# Example command:
# ./deduplicatemdb.py -s mongodb -p 27017 -d database -c collection -k key1,key2 -r 10000
import argparse
import pymongo
from pymongo import MongoClient
from bson.objectid import ObjectId
def parse_args(argv=None):
    """Parse the command-line options of the deduplication utility.

    :param argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    :return: the ``argparse.Namespace`` with ``server_name``, ``port_number``,
        ``database_name``, ``collection_name``, ``key`` and ``report``.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument('-s', '--server_name', help='Database server name.', required=True)
    argparser.add_argument('-p', '--port_number', help='Database port number.', required=False, default=27017, type=int)
    argparser.add_argument('-d', '--database_name', help='Database name.', required=True)
    argparser.add_argument('-c', '--collection_name', help='Collection name.', required=True)
    # The original help text ("Destination database name ...") was a
    # copy-paste leftover; this option names the fields that define a duplicate.
    argparser.add_argument('-k', '--key', help='Deduplication key fields, csv format: key1,key2 etc.', required=True)
    argparser.add_argument('-r', '--report', help='Report each documents.', required=False, default=1000, type=int)
    return argparser.parse_args(argv)


def split_keys(raw):
    """Split a ``key1,key2`` string into a list of non-empty field names."""
    return [part.strip() for part in raw.split(',') if part.strip()]


def _freeze(value):
    """Return a hashable stand-in for *value*.

    Embedded documents (dicts) and arrays (lists) are not hashable, so they
    are converted to tagged tuples; the tag keeps a list of pairs from
    colliding with a dict that has the same items.
    """
    if isinstance(value, dict):
        return ('dict', tuple((name, _freeze(item)) for name, item in value.items()))
    if isinstance(value, (list, tuple)):
        return ('list', tuple(_freeze(item) for item in value))
    return value


def key_signature(document, keys):
    """Build the composite deduplication key of *document*.

    :return: a hashable tuple with one entry per name in *keys*, or ``None``
        when the document lacks at least one of the key fields.
    """
    if any(name not in document for name in keys):
        return None
    return tuple(_freeze(document[name]) for name in keys)


def iter_duplicate_flags(documents, keys):
    """Yield ``(document, is_duplicate)`` for every item of *documents*.

    A document is a duplicate when an earlier document had the same values
    for *all* key fields taken together. Comparing each field against its own
    set of seen values would wrongly flag documents whose combination of
    values is new. Documents missing a key field are never flagged, so they
    are never deleted by the caller.
    """
    seen = set()
    for document in documents:
        signature = key_signature(document, keys)
        if signature is None:
            yield document, False
        elif signature in seen:
            yield document, True
        else:
            seen.add(signature)
            yield document, False


def percent(part, whole):
    """Return the integer percentage ``part * 100 // whole``; 0 if *whole* is 0."""
    if not whole:
        return 0
    return part * 100 // whole


def print_progress(observed_count, deleted_count, total_count):
    """Print the observed/deleted counters with their percentages."""
    print("Observed count " + str(observed_count) + " or " + str(percent(observed_count, total_count)) + "% of all records.")
    print("Deleted count " + str(deleted_count) + " or " + str(percent(deleted_count, observed_count)) + "% of observed records.")


def count_all(collection):
    """Return the number of documents in *collection*.

    ``Cursor.count()`` / ``Collection.count()`` are gone in PyMongo 4, while
    ``count_documents`` only exists from 3.7 on, so pick by library version.
    (``hasattr`` cannot be used: unknown attributes of a Collection resolve
    to sub-collections.)
    """
    if pymongo.version_tuple[:2] >= (3, 7):
        return collection.count_documents({})
    return collection.count()


def deduplicate(collection, keys, report_every):
    """Delete every document whose key fields repeat an earlier document.

    Documents are read sorted by *keys*, so the first document in sort order
    of each group is the one that is kept.

    :param collection: the PyMongo collection to clean up.
    :param keys: list of top-level field names forming the composite key.
    :param report_every: print a progress report every this many observed
        documents; values <= 0 disable the intermediate reports.
    :return: ``(observed_count, deleted_count)``.
    """
    projection = dict((name, 1) for name in keys)
    projection['_id'] = 1
    sort_spec = [[name, pymongo.ASCENDING] for name in keys]

    print("Connection string: " + str(collection))
    print("Projection: " + str(projection))
    print("Sort cause: " + str(sort_spec))

    total_count = count_all(collection)
    print("=======================================")
    print("Count before deduplication: " + str(total_count))

    observed_count = 0
    deleted_count = 0
    cursor = collection.find({}, projection, no_cursor_timeout=True).sort(sort_spec)
    try:
        for document, is_duplicate in iter_duplicate_flags(cursor, keys):
            observed_count += 1
            if is_duplicate:
                collection.delete_one({'_id': document['_id']})
                deleted_count += 1
            if report_every > 0 and observed_count % report_every == 0:
                print_progress(observed_count, deleted_count, total_count)
    finally:
        # A no_cursor_timeout cursor is never reaped by the server, so it
        # must be closed explicitly even when the loop fails.
        cursor.close()

    print_progress(observed_count, deleted_count, total_count)
    print("=======================================")
    print("Count after deduplication: " + str(count_all(collection)))
    return observed_count, deleted_count


def main(argv=None):
    """Command-line entry point: connect, deduplicate, and close the client."""
    namespace = parse_args(argv)
    keys = split_keys(namespace.key)
    if not keys:
        raise SystemExit("error: -k/--key must name at least one field")

    client = MongoClient(host=namespace.server_name, port=namespace.port_number)
    try:
        collection = client[namespace.database_name][namespace.collection_name]
        deduplicate(collection, keys, namespace.report)
    finally:
        client.close()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment