Skip to content

Instantly share code, notes, and snippets.

@valeriocos
Last active December 4, 2019 17:38
Show Gist options
  • Save valeriocos/211f5d3a51aed7cb30cd75ec3bd444fb to your computer and use it in GitHub Desktop.
Save valeriocos/211f5d3a51aed7cb30cd75ec3bd444fb to your computer and use it in GitHub Desktop.
compare raw and upstream gitlab data
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA.
#
# Authors:
# Quan Zhou <[email protected]>
# Valerio Cosentino <[email protected]>
#
import argparse
import datetime
import elasticsearch as es
import elasticsearch_dsl as dsl
import requests
import time
# Item states matched against the `data.state` field of the indexed items;
# both are used as the default value of the --states option.
OPENED = "opened"
CLOSED = "closed"

# Item categories; the value is the path segment placed in the API URL.
ISSUES = "issues"
MRS = "merge_requests"

# Base URL of the GitLab REST API (v4) the indexed items are checked against.
GITLAB_API_URL = 'https://gitlab.com/api/v4'
def parse_args():
    """Parse the command line arguments of the script.

    :return: the `argparse.Namespace` with the attributes `host`, `index`,
        `origins`, `tokens`, `states`, `category`, `gte_iids` and
        `target_iid`
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host",
                        help="Elasticsearch host")
    parser.add_argument("-i", "--index",
                        help="Index name")
    # Origins are iterated and tokens are indexed unconditionally by the
    # rest of the script, so a missing value is reported as a usage error
    # here instead of surfacing later as a TypeError on `None`.
    parser.add_argument("-o", "--origins", nargs='+', required=True,
                        help="Origin name")
    parser.add_argument("-t", "--tokens", nargs='+', required=True,
                        help="Gitlab tokens")
    parser.add_argument("-s", "--states", default=[OPENED, CLOSED], nargs='+',
                        help="Issue/MRs states")
    parser.add_argument("-c", "--category", choices=[ISSUES, MRS], default=ISSUES,
                        help="Category issue/mrs")
    # Both thresholds are compared (`<` and `==`) with the iids read from the
    # index, which are presumably integers: parsing them as `int` avoids a
    # str/int comparison that would raise or silently never match.
    parser.add_argument('--gte-iids', default=None, type=int,
                        help="Consider iids greater or equal than the target one")
    parser.add_argument('--target-iid', default=None, type=int,
                        help="Find specific iid")
    return parser.parse_args()
def get_iids_from_index(es_host, index, origin, states):
    """Collect the iids stored in the index for a given origin.

    All the documents whose `origin` field matches the given origin are
    scanned; only the iids of the items whose state is in `states` are kept.

    :param es_host: ES host
    :param index: index
    :param origin: origin
    :param states: item states to keep (e.g., closed, opened)

    :return: a tuple with the sorted list of iids and the project id read
        from the last scanned item (None when no item is scanned)
    """
    conn = es.Elasticsearch(es_host, timeout=90)

    # Fetch only the fields needed to filter and identify the items
    search = dsl.Search(using=conn, index=index)
    search = search.query("match", origin=origin)
    search = search.source(['data.iid', 'data.state', 'data.project_id'])

    project_id = None
    found = []
    for item in search.scan():
        data = item['data']
        # The project id is taken from every item, whatever its state is
        project_id = data['project_id']
        if data['state'] not in states:
            continue
        found.append(data['iid'])

    return sorted(found), project_id
def no_exist(tokens, index, category, gte_iids, target_iid, project_id):
    """Return the iids that cannot be fetched from the GitLab API.

    Each iid is requested at
    `{GITLAB_API_URL}/projects/{project_id}/{category}/{iid}` and the
    outcome is printed. Any answer with a status code other than 200 marks
    the iid as not found. The token in use is rotated when the remaining
    rate limit drops below 5, and the function sleeps until the reset time
    when it drops below 2.

    :param tokens: non-empty list of GitLab tokens
    :param index: iterable of iids to check
    :param category: API path segment of the items (issues/merge_requests)
    :param gte_iids: if set, iids lower than this value are skipped
    :param target_iid: if set, the iteration stops as soon as this iid is
        reached (the iid itself is not requested)
    :param project_id: GitLab project id

    :return: list of the iids for which the API did not answer 200
    """
    # The thresholds may be given as strings (e.g., straight from the
    # command line); iids are compared numerically, so normalize them once.
    gte_iids = int(gte_iids) if gte_iids else None
    target_iid = int(target_iid) if target_iid else None

    tpos = 0
    removed = []
    header = {'PRIVATE-TOKEN': tokens[tpos]}

    for iid in index:
        if gte_iids is not None and iid < gte_iids:
            continue

        # NOTE(review): reaching the target iid ends the whole check without
        # requesting it -- confirm this is the intended meaning of the option.
        if target_iid is not None and iid == target_iid:
            break

        url = "{}/projects/{}/{}/{}".format(GITLAB_API_URL, project_id, category, iid)
        response = requests.get(url=url, headers=header)

        # NOTE(review): every non-200 answer (not only 404) is reported as
        # "not found" -- confirm before deleting the reported items.
        msg = "{} OK".format(url)
        if response.status_code != 200:
            removed.append(iid)
            msg = "{} not found".format(url)
        print(msg)

        remaining = response.headers.get('RateLimit-Remaining')
        reset = response.headers.get('RateLimit-Reset')
        if remaining is None or reset is None:
            # No rate limit information in this answer: nothing to throttle on
            continue

        ratelimit_remaining = int(remaining)
        ratelimit_reset = int(reset)

        if ratelimit_remaining < 5:
            # Wrap around with a modulo so the position is always a valid
            # index, also when the last token of the list was in use.
            tpos = (tpos + 1) % len(tokens)
            header = {'PRIVATE-TOKEN': tokens[tpos]}

        if ratelimit_remaining < 2:
            reset_at = datetime.datetime.fromtimestamp(ratelimit_reset)
            while datetime.datetime.now() < reset_at:
                time.sleep(1)

    return removed
def main():
    """Print, for each origin, the indexed iids not found upstream.

    For every origin given on the command line, the iids are read from the
    index and checked against the GitLab API. The iids reported as not
    found are printed in groups of 10, followed by their total number.
    """
    args = parse_args()
    separator = "*** *** *** ***"
    group_size = 10

    for origin in args.origins:
        print(separator)
        print("start: {}".format(origin))

        index_iids, project_id = get_iids_from_index(args.host, args.index,
                                                     origin, args.states)
        print("items in raw: {}".format(len(index_iids)))

        removed = no_exist(args.tokens, index_iids, args.category,
                           args.gte_iids, args.target_iid, project_id)

        # Print the iids in consecutive groups; the last one may be shorter
        for start in range(0, len(removed), group_size):
            print(removed[start:start + group_size])

        print("not found: {}".format(len(removed)))
        print("end: {}".format(origin))
        print(separator)
# Example of the options to pass when running the script:
#   --host https://...
#   -i index
#   -o origin-1 origin-2 origin-3 ...
#   -t token-1 token-2
if __name__ == "__main__":
    main()
"""
POST /<index>/_delete_by_query?refresh
{
"query": {
"bool": {
"must": {
"term": {
"origin": "<origin>"
}
},
"filter": {
"terms": {
"id_in_repo": [<value-1>, <value-2>]
}
}
}
}
}
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment