Skip to content

Instantly share code, notes, and snippets.

@w0ng
Created October 7, 2014 05:49
Show Gist options
  • Save w0ng/e21c96309ad50643723d to your computer and use it in GitHub Desktop.
Save w0ng/e21c96309ad50643723d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import urllib.request
import codecs
import json
import re
# pprint ("pretty print") formats nested structures such as lists more readably than print; handy when debugging.
from pprint import pprint
# MediaWiki API request that returns the raw wikitext of the capitals list as JSON.
url = 'https://en.wikipedia.org/w/api.php?action=parse&page=List_of_national_capitals_in_alphabetical_order&prop=wikitext&format=json'

# Table rows of interest begin with "| [[", for example: "| [[Warsaw]]...".
ROW_PREFIX = '| [['

# Raw strings keep the regex backslashes out of Python's own escape handling.
# Country is the first {{...}} template on a row; city is the first [[...]] link.
country_regex = r'\{\{(.*?)\}\}'
city_regex = r'\[\[(.*?)\]\]'


def fetch_wikitext(page_url):
    """Download *page_url* and return the wikitext string from the JSON reply.

    The reply is decoded as UTF-8 and is expected to carry the wikitext under
    ``['parse']['wikitext']['*']``; a missing key raises ``KeyError``.
    """
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(page_url) as response:
        payload = json.load(codecs.getreader('utf-8')(response))
    return payload['parse']['wikitext']['*']


def parse_capitals(wikitext):
    """Return a dict mapping each row's country template text to its city link text.

    Only lines starting with ``"| [["`` are considered. A row is recorded only
    when that same row contains both a non-empty ``{{...}}`` match and a
    non-empty ``[[...]]`` match. If a country appears on several rows, the
    last row wins.
    """
    capitals = {}
    for line in wikitext.split('\n'):
        if not line.startswith(ROW_PREFIX):
            continue
        country_match = re.search(country_regex, line)
        city_match = re.search(city_regex, line)
        # Skip rows missing either part so a value from an earlier row is
        # never paired with this row's data.
        if country_match is None or city_match is None:
            continue
        country = country_match.group(1)
        city = city_match.group(1)
        if country and city:
            capitals[country] = city
    return capitals


def main():
    """Fetch the list and print one "country: city" line per entry, sorted."""
    capital = parse_capitals(fetch_wikitext(url))
    for country_name, city_name in sorted(capital.items()):
        print(country_name + ": " + city_name)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment