Generate ISO Code CSV from Wiki link
# original link found here: https://gis.stackexchange.com/questions/1047/seeking-full-list-of-iso-alpha-2-and-iso-alpha-3-country-codes
import csv
import urllib.request as urllib2
from bs4 import BeautifulSoup

# helper to join the text fragments of a table cell (including anchor text),
# dropping newlines and surrounding whitespace
def format_text(array):
    return ''.join(array).strip()

# fetch the Wikipedia ISO 3166-1 page with a browser-like User-Agent
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
url = 'http://en.wikipedia.org/wiki/ISO_3166-1'
page = opener.open(url)
soup = BeautifulSoup(page.read())

# "Current codes" is the second sortable table on the page
t = soup.findAll('table', {'class': 'wikitable sortable'})[1]

# create a new CSV for the output (newline='' avoids blank rows on Windows)
iso_csv = csv.writer(open('wikipedia-iso-country-codes.csv', 'w', newline=''))

# get the header row and write it to the CSV
header_row = [format_text(th.findAll(text=True)) for th in t.findAll('th')]
iso_csv.writerow(header_row)

# iterate over the table rows, pulling out the country results; skip the first
# row, which holds the already-parsed header information
for row in t.findAll("tr")[1:]:
    tds = row.findAll('td')
    raw_cols = [td.findAll(text=True) for td in tds]
    formatted_cols = [format_text(c) for c in raw_cols]
    iso_csv.writerow(formatted_cols)
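To sanity-check the output, a minimal sketch of reading the generated CSV back with the standard library (the column names simply mirror whatever headers the Wikipedia table has at the time, so the print here is illustrative):

import csv

# read back the CSV written by the script above; the header row pulled from
# the Wikipedia table becomes the keys of each row dict
with open('wikipedia-iso-country-codes.csv', newline='') as f:
    reader = csv.DictReader(f)
    for country in reader:
        print(country)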
Updates today for BeautifulSoup 4 and a change to the page HTML:
soup = BeautifulSoup(page.read(), features="html.parser")
t = soup.findAll('table', {'class' : 'wikitable sortable'})[0]
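Putting those two updates in context, a sketch of the revised fetch-and-parse step (the rest of the script is unchanged; the explicit "html.parser" avoids the BeautifulSoup 4 missing-parser warning, and the current-codes table is now the first 'wikitable sortable' match on the page):

import urllib.request as urllib2
from bs4 import BeautifulSoup

opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
page = opener.open('http://en.wikipedia.org/wiki/ISO_3166-1')

# BeautifulSoup 4 wants an explicit parser; the built-in html.parser keeps
# the script free of extra dependencies
soup = BeautifulSoup(page.read(), features="html.parser")

# after the page-HTML change, the current-codes table is the first match
t = soup.findAll('table', {'class': 'wikitable sortable'})[0]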