Skip to content

Instantly share code, notes, and snippets.

@gartenfeld
Created December 3, 2014 17:51
Show Gist options
  • Select an option

  • Save gartenfeld/a970d7295dd4cbaa0be8 to your computer and use it in GitHub Desktop.

Select an option

Save gartenfeld/a970d7295dd4cbaa0be8 to your computer and use it in GitHub Desktop.
Removing duplicate documents from a MongoDB collection using the aggregation framework.
from bs4 import BeautifulSoup
import re # Regular Expressions
import collections # Data Types
import sys # File operations
import codecs # UniCode support
import os
from pymongo import Connection # For DB Connection
from pymongo.errors import ConnectionFailure # For catching exeptions
def drop_dups(database, collection):
    """Delete duplicate documents from a MongoDB collection.

    Two documents count as duplicates when both their ``headword`` and
    ``senses`` fields are equal. For each set of duplicates, the first
    ``_id`` in the aggregated ``uniqueIds`` list is kept and every other
    document in the set is removed. Progress is printed to stdout.

    Args:
        database: Name of the database on the MongoDB server at
            localhost:27017.
        collection: Name of the collection to de-duplicate.

    Raises:
        ConnectionFailure: If the connection cannot be established. The
            error is written to stderr and then re-raised, so the caller
            never continues with a half-initialised state.
    """
    # Connect to MongoDB
    print("Connecting to database...")
    try:
        db_connection = Connection(host="localhost", port=27017)
    except ConnectionFailure as e:
        sys.stderr.write("Could not connect to MongoDB: %s\n" % e)
        raise
    print("Connected to MongoDB successfully!")

    try:
        target = db_connection[database][collection]

        # Use aggregation to select all duplicate sets: group on the
        # (headword, senses) pair, collect the _ids of each group, and keep
        # only the groups that contain more than one document.
        print("Aggregating Documents...")
        dup_sets = target.aggregate([
            {"$group": {
                "_id": {
                    "headword": "$headword",
                    "senses": "$senses",
                },
                "uniqueIds": {"$addToSet": "$_id"},
                "count": {"$sum": 1},
            }},
            {"$match": {"count": {"$gt": 1}}},
        ], cursor={}, allowDiskUse=True)

        total_count = 0
        for doc in dup_sets:
            # The headword is part of the group key, so there is no need to
            # fetch each duplicate again just to report its name.
            headword = doc["_id"].get("headword")
            # Skip the first id (the copy that is kept); numbering starts at
            # 2 so the output matches the position within the duplicate set.
            for position, doc_id in enumerate(doc["uniqueIds"][1:], start=2):
                # Delete strictly by _id. Passing a fetched document (which
                # may be None) as the filter risks matching far more than
                # the one intended document.
                target.remove({"_id": doc_id})
                total_count += 1
                print("#%d %s removed." % (position, headword))
        print(str(total_count) + " duplicates deleted.")
    finally:
        # Release the connection whether or not the clean-up succeeded.
        db_connection.close()
if __name__ == "__main__":
    # Script entry point: de-duplicate the "sanat" collection of the
    # "stage" database, then report completion.
    db_name, collection_name = "stage", "sanat"
    drop_dups(database=db_name, collection=collection_name)
    print("Valmis!")  # Finnish for "Done!"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment