Skip to content

Instantly share code, notes, and snippets.

@diggzhang
Created December 5, 2016 02:20
Show Gist options
  • Save diggzhang/209cb13696811f267fe8f39a6d08fb03 to your computer and use it in GitHub Desktop.
Save diggzhang/209cb13696811f267fe8f39a6d08fb03 to your computer and use it in GitHub Desktop.
查找重复的uuid,找出重发的埋点。(Find duplicate uuids to identify re-sent tracking events.)
# _*_ coding:utf-8 _*_
from __future__ import print_function
from pymongo import MongoClient
import datetime
import time

# Connect to the `eventsV4` database and use its `eventV4` collection.
db = MongoClient('10.8.8.111', 27017)['eventsV4']
events = db['eventV4']
# Time window for the duplicate scan: [start, end) on `serverTime`.
start = datetime.datetime(2016, 11, 30, 22, 1)
end = datetime.datetime(2016, 12, 1, 0, 1)
# Alternative (wider) window kept for reference:
# start = datetime.datetime(2016, 11, 30, 16)
# end = datetime.datetime(2016, 12, 1, 16)
def find_dup_key(start_time=None, end_time=None):
    """Return one document per duplicated uuid in the given time window.

    Runs an aggregation over the module-level `events` collection,
    restricted to ``start_time <= serverTime < end_time``, and returns a
    list of ``{"_id": <uuid>, "count": <n>}`` documents for every uuid
    that occurs at least twice (i.e. re-sent tracking events).

    Args:
        start_time: Inclusive lower bound on ``serverTime``. Defaults to
            the module-level ``start``.
        end_time: Exclusive upper bound on ``serverTime``. Defaults to
            the module-level ``end``.

    Returns:
        list of dicts, one per duplicated uuid.
    """
    if start_time is None:
        start_time = start
    if end_time is None:
        end_time = end
    pipeline = [
        # Restrict to the time window. This could also be turned into a
        # chunked query; a full collection scan works too, but memory may
        # not cope with it.
        {"$match": {
            "serverTime": {"$gte": start_time, "$lt": end_time}
        }},
        # Only keep documents where the grouping key exists — without
        # this, every document missing `uuid` would collapse into a
        # single $group bucket and skew the counts.
        {"$match": {
            "uuid": {"$exists": True},
        }},
        # Group identical uuids together and count occurrences.
        {"$group": {
            "_id": "$uuid",
            "count": {"$sum": 1}
        }},
        # count >= 2 means the uuid (and hence the event) is duplicated.
        {"$match": {
            "count": {"$gte": 2}
        }}
    ]
    # allowDiskUse=True works around MongoDB's in-memory stage size limit.
    return list(events.aggregate(pipeline, allowDiskUse=True))
# Run the scan and show every duplicated-uuid group that was found.
all_dup_key = find_dup_key()
for dup_doc in all_dup_key:
    print(dup_doc)
# Total number of uuids that appear more than once in the window.
print(len(all_dup_key))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment