tgherzog · October 31, 2023 17:58 · tgherzog · Apr 18, 2022
diff --git a/build.py b/build.py
 import wbgapi as wb
 import pandas as pd
 import json

 # Background: this is an attempt to make part of the data API more consistent and coherent.
 # The API back-end has a mapping table that specifies the canonical database for each
 # indicator when indicators appear in more than one database and the user doesn't explicitly
 # specify a database, e.g.:
 #   https://api.worldbank.org/v2/indicator/SP.POP.TOTL           (database is ambiguous)
 #   https://api.worldbank.org/v2/indicator/SP.POP.TOTL?source=16 (database is clear)
 # The problem with the existing approach is that 1) it is not transparent; 2) it is vulnerable
 # to inconsistent rules, and 3) things behave unexpectedly as indicators are added, removed or
 # moved to WDI archives. 
 # An improved approach implemented circa June, 2021 is to effectively implement a database hierarchy
 # so that responses would be consistent when the database is ambiguous. We requested that ITS
 # implement a control file with the database hierarchy so that DECDG could reference and update
 # it as necessary. I think in reality, they use the control file to update the original mapping
 # table rather than reference it directly in code.

 # This script shows how the current table is generated. The default hierarchy is simply the order
 # in which databases (sources) are reported by the API, which happens to be sorted by database ID.
 # Databases 2, 63, and 16 (WDI, Human Capital and HNP) are then bumped up in priority, and
 # Africa Development Indicators (sunsetted) and WDI Archives are moved to the bottom.

 # Because neither the code nor the resulting table is directly referenced by the API (they
 # are used to regenerate an internal table that we cannot directly access), you still need
 # to do some QA after changing the database hierarchy.

 # Fetch the list of databases (sources) once and reuse it; the row labels are
 # the database IDs, which the .loc lookups below address as strings.
 sources = wb.source.Series()
 df = pd.DataFrame({'name': sources})

 # Default hierarchy: ten times the numeric database ID.
 df['ranking'] = df.index.astype('int64') * 10

 # Bump databases 2, 63 and 16 to the top of the hierarchy, in that order.
 df.loc['2', 'ranking'] = 1
 df.loc['63', 'ranking'] = 2
 df.loc['16', 'ranking'] = 3

 # Push databases 11 and 57 below every other entry, at rank 1000 or higher.
 # NOTE(review): .loc assignment would presumably add a new row if one of these
 # IDs is no longer reported by the API -- confirm that is acceptable.
 bottom = max(df['ranking'].max()+1, 1000)
 df.loc['11', 'ranking'] = bottom
 df.loc['57', 'ranking'] = bottom + 1

 # Emit the hierarchy as JSON on stdout; both fields are serialized as strings.
 rankings = [{'sourceId': str(k), 'ranking': str(row['ranking'])} for k, row in df.iterrows()]
 print(json.dumps(rankings))
diff --git a/rankinghierarchy.json b/rankinghierarchy.json
 [{"sourceId": "1", "ranking": "10"}, {"sourceId": "2", "ranking": "1"}, {"sourceId": "3", "ranking": "30"}, {"sourceId": "5", "ranking": "50"}, {"sourceId": "6", "ranking": "60"}, {"sourceId": "11", "ranking": "1000"}, {"sourceId": "12", "ranking": "120"}, {"sourceId": "13", "ranking": "130"}, {"sourceId": "14", "ranking": "140"}, {"sourceId": "15", "ranking": "150"}, {"sourceId": "16", "ranking": "3"}, {"sourceId": "18", "ranking": "180"}, {"sourceId": "19", "ranking": "190"}, {"sourceId": "20", "ranking": "200"}, {"sourceId": "22", "ranking": "220"}, {"sourceId": "23", "ranking": "230"}, {"sourceId": "24", "ranking": "240"}, {"sourceId": "25", "ranking": "250"}, {"sourceId": "27", "ranking": "270"}, {"sourceId": "28", "ranking": "280"}, {"sourceId": "29", "ranking": "290"}, {"sourceId": "30", "ranking": "300"}, {"sourceId": "31", "ranking": "310"}, {"sourceId": "32", "ranking": "320"}, {"sourceId": "33", "ranking": "330"}, {"sourceId": "34", "ranking": "340"}, {"sourceId": "35", "ranking": "350"}, {"sourceId": "36", "ranking": "360"}, {"sourceId": "37", "ranking": "370"}, {"sourceId": "38", "ranking": "380"}, {"sourceId": "39", "ranking": "390"}, {"sourceId": "40", "ranking": "400"}, {"sourceId": "41", "ranking": "410"}, {"sourceId": "43", "ranking": "430"}, {"sourceId": "45", "ranking": "450"}, {"sourceId": "46", "ranking": "460"}, {"sourceId": "50", "ranking": "500"}, {"sourceId": "54", "ranking": "540"}, {"sourceId": "57", "ranking": "1001"}, {"sourceId": "58", "ranking": "580"}, {"sourceId": "59", "ranking": "590"}, {"sourceId": "60", "ranking": "600"}, {"sourceId": "61", "ranking": "610"}, {"sourceId": "62", "ranking": "620"}, {"sourceId": "63", "ranking": "2"}, {"sourceId": "64", "ranking": "640"}, {"sourceId": "65", "ranking": "650"}, {"sourceId": "66", "ranking": "660"}, {"sourceId": "67", "ranking": "670"}, {"sourceId": "68", "ranking": "680"}, {"sourceId": "69", "ranking": "690"}, {"sourceId": "70", "ranking": "700"}, {"sourceId": "71", "ranking": 
"710"}, {"sourceId": "72", "ranking": "720"}, {"sourceId": "73", "ranking": "730"}, {"sourceId": "75", "ranking": "750"}, {"sourceId": "76", "ranking": "760"}, {"sourceId": "77", "ranking": "770"}, {"sourceId": "78", "ranking": "780"}, {"sourceId": "79", "ranking": "790"}, {"sourceId": "80", "ranking": "800"}, {"sourceId": "81", "ranking": "810"}, {"sourceId": "82", "ranking": "820"}, {"sourceId": "83", "ranking": "830"}, {"sourceId": "84", "ranking": "840"}]
	import wbgapi as wb
	import pandas as pd
	import json

	# Background: this is an attempt to make part of the data API more consistent and coherent.
	# The API back-end has a mapping table that specifies the canonical database for each
	# indicator when indicators appear in more than one database and the user doesn't explicitly
	# specify a database, e.g.:
	# https://api.worldbank.org/v2/indicator/SP.POP.TOTL (database is ambiguous)
	# https://api.worldbank.org/v2/indicator/SP.POP.TOTL?source=16 (database is clear)
	# The problem with the existing approach is that 1) it is not transparent; 2) it is vulnerable
	# to inconsistent rules, and 3) things behave unexpectedly as indicators are added, removed or
	# moved to WDI archives.
	# An improved approach implemented circa June, 2021 is to effectively implement a database hierarchy
	# so that responses would be consistent when the database is ambiguous. We requested that ITS
	# implement a control file with the database hierarchy so that DECDG could reference and update
	# it as necessary. I think in reality, they use the control file to update the original mapping
	# table rather than reference it directly in code.

	# This script shows how the current table is generated. The default hierarchy is simply the order
	# in which databases (sources) are reported by the API, which happens to be sorted by database ID.
	# Databases 2, 63, and 16 (WDI, Human Capital and HNP) are then bumped up in priority, and
	# Africa Development Indicators (sunsetted) and WDI Archives are moved to the bottom.

	# Because neither the code nor the resulting table is directly referenced by the API (they
	# are used to regenerate an internal table that we cannot directly access), you still need
	# to do some QA after changing the database hierarchy.

	# Fetch the list of databases (sources) once and reuse it; the row labels are
	# the database IDs, which the .loc lookups below address as strings.
	sources = wb.source.Series()
	df = pd.DataFrame({'name': sources})

	# Default hierarchy: ten times the numeric database ID.
	df['ranking'] = df.index.astype('int64') * 10

	# Bump databases 2, 63 and 16 to the top of the hierarchy, in that order.
	df.loc['2', 'ranking'] = 1
	df.loc['63', 'ranking'] = 2
	df.loc['16', 'ranking'] = 3

	# Push databases 11 and 57 below every other entry, at rank 1000 or higher.
	# NOTE(review): .loc assignment would presumably add a new row if one of these
	# IDs is no longer reported by the API -- confirm that is acceptable.
	bottom = max(df['ranking'].max()+1, 1000)
	df.loc['11', 'ranking'] = bottom
	df.loc['57', 'ranking'] = bottom + 1

	# Emit the hierarchy as JSON on stdout; both fields are serialized as strings.
	rankings = [{'sourceId': str(k), 'ranking': str(row['ranking'])} for k, row in df.iterrows()]
	print(json.dumps(rankings))