Last active
February 12, 2018 18:43
-
-
Save kireal/77b54002b17bd7285fb21b99b68e4981 to your computer and use it in GitHub Desktop.
Run to deduplicate mongodb collection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Utility for deduplicating a MongoDB collection.
#
# Works only with pymongo >= 3.0
# Kireal
#
# 01.10.2016
# Last change: 04.10.2016
# TODO:
#
# Example command:
# ./deduplicatemdb.py -s mongodb -p 27017 -d database -c collection -k key1,key2 -r 10000
import argparse

import pymongo
from bson.objectid import ObjectId
from pymongo import MongoClient
# Sentinel for "field not present", so an explicit null (None) value stays
# distinguishable from a field the document does not have.
_MISSING = object()


def build_arg_parser():
    """Return the command-line parser for the deduplication utility."""
    argparser = argparse.ArgumentParser(
        description='Remove documents that repeat an earlier key combination.')
    argparser.add_argument('-s', '--server_name', help='Database server name.', required=True)
    argparser.add_argument('-p', '--port_number', help='Database port number.', required=False, default=27017, type=int)
    argparser.add_argument('-d', '--database_name', help='Database name.', required=True)
    argparser.add_argument('-c', '--collection_name', help='Collection name.', required=True)
    argparser.add_argument('-k', '--key', help='Fields that together identify a duplicate, csv format: key1,key2 etc.', required=True)
    argparser.add_argument('-r', '--report', help='Print a progress report every REPORT documents.', required=False, default=1000, type=int)
    return argparser


def parse_keys(raw):
    """Split a comma-separated key list, dropping blanks and surrounding spaces."""
    return [name.strip() for name in raw.split(',') if name.strip()]


def lookup_field(document, path):
    """Return the value stored at *path* in *document*, or ``_MISSING``.

    *path* is tried as a literal top-level field first and then as a dotted
    path through nested sub-documents.
    """
    if path in document:
        return document[path]
    value = document
    for part in path.split('.'):
        if not isinstance(value, dict) or part not in value:
            return _MISSING
        value = value[part]
    return value


def freeze(value):
    """Return a hashable stand-in for *value* so it can be stored in a set.

    Lists and dicts are converted recursively into tagged tuples; the tag
    keeps a sub-document from comparing equal to a list of pairs.
    """
    if isinstance(value, dict):
        return ('dict',) + tuple((name, freeze(item)) for name, item in value.items())
    if isinstance(value, (list, tuple)):
        return ('list',) + tuple(freeze(item) for item in value)
    return value


def flag_duplicates(documents, keys):
    """Yield ``(document, is_duplicate)`` for every document, in input order.

    A document is a duplicate when the *combination* of its values for all
    *keys* has already been seen on an earlier document.  Documents lacking
    any of the key fields are never flagged, so a mistyped key name cannot
    cause a mass deletion.
    """
    seen = set()
    for document in documents:
        values = [lookup_field(document, name) for name in keys]
        if any(value is _MISSING for value in values):
            yield document, False
            continue
        signature = freeze(values)
        if signature in seen:
            yield document, True
        else:
            seen.add(signature)
            yield document, False


def percent(part, whole):
    """Return *part* as a whole-number percentage of *whole* (0 if *whole* is 0)."""
    return part * 100 // whole if whole else 0


def count_all(collection):
    """Return the number of documents in *collection*.

    Uses ``count_documents`` where the installed PyMongo provides it and
    falls back to the older ``count`` otherwise.  The check is made on the
    class because attribute access on a Collection instance resolves unknown
    names to sub-collections.
    """
    if hasattr(type(collection), 'count_documents'):
        return collection.count_documents({})
    return collection.count()


def print_report(observed_count, deleted_count, total):
    """Print the observed/deleted progress lines."""
    print("Observed count " + str(observed_count) + " or " + str(percent(observed_count, total)) + "% of all records.")
    print("Deleted count " + str(deleted_count) + " or " + str(percent(deleted_count, observed_count)) + "% of observed records.")


def main():
    """Parse the command line and deduplicate the requested collection."""
    argparser = build_arg_parser()
    namespace = argparser.parse_args()
    keys = parse_keys(namespace.key)
    if not keys:
        argparser.error('-k/--key must name at least one field')
    # A non-positive interval would otherwise divide by zero below.
    interval = max(namespace.report, 1)

    client = MongoClient(host=namespace.server_name, port=namespace.port_number)
    try:
        collection = client[namespace.database_name][namespace.collection_name]
        # Fetch only the key fields and _id; nothing else is needed.
        projection = dict((name, 1) for name in keys)
        projection['_id'] = 1
        # The sort decides which document of a duplicate group comes first
        # and is therefore the one that is kept.
        sort_spec = [[name, pymongo.ASCENDING] for name in keys]
        print("Connection string: " + str(collection))
        print("Projection: " + str(projection))
        print("Sort cause: " + str(sort_spec))

        total = count_all(collection)
        print("=======================================")
        print("Count before deduplication: " + str(total))

        observed_count = 0
        deleted_count = 0
        cursor = collection.find({}, projection, no_cursor_timeout=True).sort(sort_spec)
        try:
            for document, is_duplicate in flag_duplicates(cursor, keys):
                observed_count += 1
                if is_duplicate:
                    collection.delete_one({'_id': document['_id']})
                    deleted_count += 1
                if observed_count % interval == 0:
                    print_report(observed_count, deleted_count, total)
        finally:
            # Cursors opened with no_cursor_timeout must be closed explicitly.
            cursor.close()

        print_report(observed_count, deleted_count, total)
        print("=======================================")
        print("Count after deduplication: " + str(count_all(collection)))
    finally:
        client.close()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment