Created
December 5, 2016 02:20
-
-
Save diggzhang/209cb13696811f267fe8f39a6d08fb03 to your computer and use it in GitHub Desktop.
查找重复的uuid,找出重发的埋点。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# _*_ coding:utf-8 _*_ | |
from __future__ import print_function | |
from pymongo import MongoClient | |
import datetime | |
import time | |
# Handle to the ``eventV4`` collection inside the ``eventsV4`` database.
db = MongoClient(host='10.8.8.111', port=27017)['eventsV4']
events = db['eventV4']

# Time window to scan; find_dup_key() filters serverTime with
# ``start`` as the inclusive lower bound and ``end`` as the exclusive upper bound.
start = datetime.datetime(year=2016, month=11, day=30, hour=22, minute=1)
end = datetime.datetime(year=2016, month=12, day=1, hour=0, minute=1)
# Alternative 24-hour window, kept for reference:
# start = datetime.datetime(2016, 11, 30, 16)
# end = datetime.datetime(2016, 12, 1, 16)
def find_dup_key(start_time=None, end_time=None, collection=None):
    """Return the uuids that occur more than once inside a time window.

    Args:
        start_time: Inclusive lower bound for ``serverTime``. Defaults to
            the module-level ``start``.
        end_time: Exclusive upper bound for ``serverTime``. Defaults to the
            module-level ``end``.
        collection: Object exposing ``aggregate(pipeline, allowDiskUse=...)``.
            Defaults to the module-level ``events`` collection.

    Returns:
        A list with one entry per duplicated uuid, as produced by the
        ``$group`` stage: ``{"_id": <uuid>, "count": <occurrences>}`` with
        ``count >= 2``.
    """
    # Fall back to the script-level configuration so that a bare
    # ``find_dup_key()`` call keeps its original behaviour.
    if start_time is None:
        start_time = start
    if end_time is None:
        end_time = end
    if collection is None:
        collection = events

    pipeline = [
        # Restrict the scan to the requested window. The window could also be
        # used to query in chunks; a full collection scan is possible but may
        # need more memory than is available.
        # ``uuid`` must be present on every document that reaches $group,
        # otherwise documents without it would be grouped together and
        # reported as a bogus duplicate.
        {"$match": {
            "serverTime": {"$gte": start_time, "$lt": end_time},
            "uuid": {"$exists": True},
        }},
        # Bucket the documents by uuid and count the members of each bucket.
        {"$group": {
            "_id": "$uuid",
            "count": {"$sum": 1},
        }},
        # A uuid seen two or more times is a duplicate.
        {"$match": {
            "count": {"$gte": 2},
        }},
    ]
    # allowDiskUse lets memory-hungry stages such as $group spill to
    # temporary files instead of failing when they exceed the server's
    # per-stage memory limit.
    return list(collection.aggregate(pipeline, allowDiskUse=True))
# Run the duplicate search once, then list every duplicated uuid entry.
all_dup_key = find_dup_key()
for doc in all_dup_key:
    print(doc)

# Number of distinct uuids that were found more than once.
print(len(all_dup_key))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment