Last active
December 4, 2019 17:38
-
-
Save valeriocos/211f5d3a51aed7cb30cd75ec3bd444fb to your computer and use it in GitHub Desktop.
compare raw and upstream gitlab data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# | |
# Copyright (C) 2015-2019 Bitergia | |
# | |
# This program is free software; you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation; either version 3 of the License, or | |
# (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with this program; if not, write to the Free Software | |
# Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA. | |
# | |
# Authors: | |
# Quan Zhou <[email protected]> | |
# Valerio Cosentino <[email protected]> | |
# | |
import argparse | |
import datetime | |
import elasticsearch as es | |
import elasticsearch_dsl as dsl | |
import requests | |
import time | |
# Item states accepted by the -s/--states option.
OPENED = 'opened'
CLOSED = 'closed'

# Item categories accepted by the -c/--category option; the value is also
# used as the path segment of the GitLab API URL.
ISSUES = 'issues'
MRS = 'merge_requests'

# Base URL of the GitLab REST API (v4) queried for every iid.
GITLAB_API_URL = "https://gitlab.com/api/v4"
def parse_args():
    """Parse the command-line arguments of the script.

    :return: the `argparse.Namespace` holding the attributes `host`, `index`,
        `origins`, `tokens`, `states`, `category`, `gte_iids` and `target_iid`
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host",
                        help="Elasticsearch host")
    parser.add_argument("-i", "--index",
                        help="Index name")
    # `main` iterates over the origins and `no_exist` indexes into the tokens,
    # so both are mandatory: fail with a usage message instead of a TypeError.
    parser.add_argument("-o", "--origins", nargs='+', required=True,
                        help="Origin name")
    parser.add_argument("-t", "--tokens", nargs='+', required=True,
                        help="Gitlab tokens")
    parser.add_argument("-s", "--states", default=[OPENED, CLOSED], nargs='+',
                        help="Issue/MRs states")
    parser.add_argument("-c", "--category", choices=[ISSUES, MRS], default=ISSUES,
                        help="Category issue/mrs")
    # The iid options are compared against the iids collected from the index;
    # parse them as integers so the comparisons are numeric rather than
    # str-vs-int.
    parser.add_argument('--gte-iids', default=None, type=int,
                        help="Consider iids greater or equal than the target one")
    parser.add_argument('--target-iid', default=None, type=int,
                        help="Find specific iid")

    return parser.parse_args()
def get_iids_from_index(es_host, index, origin, states):
    """Collect the iids stored in the index for a given origin.

    Only the documents whose `data.state` is listed in `states` contribute
    an iid. The project id is read from every scanned document, so the
    value returned is the one of the last document seen (or `None` when
    the query yields no documents).

    :param es_host: ES host
    :param index: index
    :param origin: origin
    :param states: states to keep (e.g., closed and/or opened)

    :return: a tuple with the sorted list of iids and the project id
    """
    es_client = es.Elasticsearch(es_host, timeout=90)

    search = dsl.Search(using=es_client, index=index)
    search = search.query("match", origin=origin)
    # Fetch only the fields that are actually read below
    search = search.source(['data.iid', 'data.state', 'data.project_id'])

    project_id = None
    iids = []
    for hit in search.scan():
        data = hit['data']
        project_id = data['project_id']
        if data['state'] not in states:
            continue
        iids.append(data['iid'])

    return sorted(iids), project_id
def _iid_or_none(value):
    """Convert an optional iid bound (int, numeric string, None or '') to int or None."""
    if value is None or value == '':
        return None
    return int(value)


def no_exist(tokens, index, category, gte_iids, target_iid, project_id):
    """Return the iids of the index that the GitLab API does not serve.

    Every iid is requested from
    `<GITLAB_API_URL>/projects/<project_id>/<category>/<iid>`; the iids whose
    response status is not 200 are collected. One line per request is
    printed with the outcome.

    :param tokens: list of GitLab private tokens, used in rotation
    :param index: iterable of iids to check (compared numerically with the
        bounds below, so they are assumed to be integers)
    :param category: API path segment of the items (e.g., issues)
    :param gte_iids: when set, iids lower than this value are skipped
    :param target_iid: when set, the scan stops as soon as this iid is
        reached (the iid itself is not requested)
    :param project_id: id of the GitLab project

    :return: list of the iids for which the API did not answer 200

    :raises ValueError: if `gte_iids` or `target_iid` is not numeric
    """
    # Values coming from the command line may be strings; normalize them so
    # the comparisons with the iids are always int-vs-int.
    gte_iids = _iid_or_none(gte_iids)
    target_iid = _iid_or_none(target_iid)

    tpos = 0
    removed = []
    header = {'PRIVATE-TOKEN': tokens[tpos]}

    for iid in index:
        if gte_iids is not None and iid < gte_iids:
            continue
        if target_iid is not None and iid == target_iid:
            break

        url = "{}/projects/{}/{}/{}".format(GITLAB_API_URL, project_id, category, iid)
        response = requests.get(url=url, headers=header)

        # NOTE(review): any non-200 answer (not only 404) is reported as
        # "not found", as in the original logic.
        if response.status_code != 200:
            removed.append(iid)
            msg = "{} not found".format(url)
        else:
            msg = "{} OK".format(url)
        print(msg)

        # The rate limit headers may be missing in some responses; without
        # them there is nothing to act upon.
        remaining = response.headers.get('RateLimit-Remaining')
        if remaining is None:
            continue
        remaining = int(remaining)

        if remaining < 5:
            # Move to the next token, wrapping around at the end of the list
            tpos = (tpos + 1) % len(tokens)
            header = {'PRIVATE-TOKEN': tokens[tpos]}

        reset = response.headers.get('RateLimit-Reset')
        if remaining < 2 and reset is not None:
            # Wait until the reset time announced by the server
            delay = int(reset) - time.time()
            if delay > 0:
                time.sleep(delay)

    return removed
def main():
    """Report, for each origin, the indexed iids not found in GitLab.

    The iids stored in the index are checked against the GitLab API and
    the ones not found are printed in groups of ten, followed by their
    total number.
    """
    args = parse_args()

    for origin in args.origins:
        print("*** *** *** ***")
        print("start: {}".format(origin))

        index_iids, project_id = get_iids_from_index(args.host, args.index,
                                                     origin, args.states)
        print("items in raw: {}".format(len(index_iids)))

        removed = no_exist(args.tokens, index_iids, args.category,
                           args.gte_iids, args.target_iid, project_id)

        # Print the missing iids ten per line
        for start in range(0, len(removed), 10):
            print(removed[start:start + 10])

        print("not found: {}".format(len(removed)))
        print("end: {}".format(origin))
        print("*** *** *** ***")
if __name__ == '__main__':
    # Example of arguments (multi-valued options take space-separated values):
    #
    #   --host https://...
    #   -i index
    #   -o origin-1 origin-2 origin-3 ...
    #   -t token-1 token-2
    main()

# The iids reported as not found can then be removed from the index with
# a request like the following one:
#
# POST /<index>/_delete_by_query?refresh
# {
#   "query": {
#     "bool": {
#       "must": {
#         "term": {
#           "origin": "<origin>"
#         }
#       },
#       "filter": {
#         "terms": {
#           "id_in_repo": [<value-1>, <value-2>]
#         }
#       }
#     }
#   }
# }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment