# Gist harej/9ce360452e4728eb1020bd89e35e1dec, created June 27, 2020 18:43.
# Save the gist to your computer to use it in GitHub Desktop.
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that
# may be interpreted or compiled differently than it appears. To review it,
# open the file in an editor that reveals hidden Unicode characters.
import json
import os
import sys
from multiprocessing.dummy import Pool as ThreadPool

import requests
from wikidataintegrator import wdi_core, wdi_login
from wikidataintegrator.wdi_core import WDItemEngine
# Endpoints of the Wikibase instance this script reads from and writes to.
mediawiki_api_url = 'https://iagraph.wiki.opencura.com/w/api.php'
sparql_endpoint_url = 'https://iagraph.wiki.opencura.com/query/sparql'

# Credentials are taken from the environment when available so that real
# secrets do not have to be written into the source file; the original
# placeholder values remain the fallback.
login = wdi_login.WDLogin(
    user=os.environ.get('WIKIBASE_USERNAME', 'USERNAME'),
    pwd=os.environ.get('WIKIBASE_PASSWORD', 'PASSWORD'),
    mediawiki_api_url=mediawiki_api_url)

# Item-engine class bound to the API and SPARQL endpoints above.
get_item = WDItemEngine.wikibase_item_engine_factory(mediawiki_api_url,
                                                     sparql_endpoint_url)
def remove_dupe_dicts(l):
    """Return the dicts in *l* with exact duplicates removed.

    Each dict is serialised to JSON with sorted keys, so two dicts with equal
    content are treated as duplicates regardless of their key order. The
    first occurrence of each distinct dict is kept and the relative order of
    the survivors matches the input.

    Args:
        l: Iterable of JSON-serialisable dicts.

    Returns:
        A new list of dicts rebuilt from their JSON form; the input objects
        are not mutated.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; a plain
    # set would hand the survivors back in arbitrary order.
    unique_blobs = dict.fromkeys(json.dumps(d, sort_keys=True) for d in l)
    return [json.loads(s) for s in unique_blobs]
def run_dedupe(wb_id):
    """Strip duplicate references from every claim of one Wikibase item.

    Fetches the item's JSON from Special:EntityData, de-duplicates the
    reference list of each claim with ``remove_dupe_dicts`` and, only when at
    least one claim actually lost a reference, writes the cleaned JSON back
    through the item engine. Progress and errors are reported on stdout.

    Args:
        wb_id: Item ID as a string, e.g. ``'Q42'``.

    Returns:
        None. Items that cannot be fetched or parsed are skipped silently.
    """
    print('.', end='', flush=True)
    try:
        r = requests.get(
            'https://iagraph.wiki.opencura.com/wiki/Special:EntityData/' + wb_id + '.json',
            timeout=60)
        blob = r.json()['entities'][wb_id]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best effort: a missing item, network failure or unexpected payload
        # just means this ID is skipped.
        return

    # `or {}` also covers an empty claims collection serialised as a list.
    claims = blob.get('claims') or {}
    edit_made = False
    for list_of_claims in claims.values():
        for claim in list_of_claims:
            # Statements without references carry no 'references' key.
            original_references = claim.get('references')
            if not original_references:
                continue
            deduped = remove_dupe_dicts(original_references)
            if len(deduped) < len(original_references):
                claim['references'] = deduped
                edit_made = True

    if not edit_made:
        return

    # Deleting additional fields added by MediaWiki but not used in WikidataIntegrator
    for key in ('pageid', 'ns', 'title', 'lastrevid', 'modified', 'type', 'id'):
        blob.pop(key, None)

    try:
        # Engine construction is kept inside the handler so that a failure
        # for one item is reported instead of aborting a pooled run.
        item = get_item(wd_item_id=wb_id, new_item=False, global_ref_mode='STRICT_OVERWRITE')
        item.wd_json_representation = blob
        new_wb_id = item.write(login)
        print(new_wb_id, flush=True)
    except Exception as e:
        print(e, flush=True)
def main():
    """Run the de-duplication for one item or for the whole ID range.

    When the last command-line argument starts with ``Q`` it is treated as a
    single item ID and only that item is processed. Otherwise every ID from
    ``Q<start>`` up to, but not including, ``Q<finish>`` is processed on a
    pool of 15 worker threads.
    """
    start = 1
    finish = 180000
    # sys.argv[0] is the script path, so require a real argument before
    # looking at it; startswith also tolerates an empty-string argument.
    if len(sys.argv) > 1 and sys.argv[-1].startswith('Q'):
        run_dedupe(sys.argv[-1])
        return

    pool = ThreadPool(15)
    pool.map(run_dedupe, ['Q' + str(n) for n in range(start, finish)])
    pool.close()
    pool.join()
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.