Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save gibbbone/6ba30ec92894130c67258ffc6e09c9a4 to your computer and use it in GitHub Desktop.
Save gibbbone/6ba30ec92894130c67258ffc6e09c9a4 to your computer and use it in GitHub Desktop.
Procedure to scrape a table from wikipedia using python. Uses MediaWikiAPI to get page content. Allows for cells spanning multiple rows and/or columns. Outputs a Pandas dataframe.
# -*- coding: utf-8 -*-
"""
Procedure to scrape a table from wikipedia using python. Uses MediaWikiAPI to get page content.
Allows for cells spanning multiple rows and/or columns. Outputs a Pandas dataframe.
Page used for testing (second table in particular):
https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart_for_English_dialects
"""
import pandas as pd
from bs4 import BeautifulSoup
from mediawikiapi import MediaWikiAPI
def wikitable_to_dataframe(table):
    """
    Convert a Wikipedia table parsed by BeautifulSoup into a DataFrame.

    Cells spanning multiple rows (``rowspan``) and/or columns
    (``colspan``) are expanded so that every grid position they cover
    receives a copy of the cell's text.

    Parameters
    ----------
    table : bs4.Tag
        A ``<table>`` element (e.g. one of class ``wikitable``).

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>``; positions not covered by any cell are ''.
    """
    rows = table.findAll("tr")
    nrows = len(rows)
    # The grid width must account for colspan: a row with few physical
    # cells can still cover many columns. Counting cells alone (the old
    # approach) under-sized the grid and raised IndexError on rows with
    # column-spanning cells.
    ncols = max(
        (sum(int(c.get("colspan", 1)) for c in r.findAll(["th", "td"]))
         for r in rows),
        default=0,
    )
    # Preallocate the full grid: rowspans discovered in earlier rows must
    # be able to write into later rows before those rows are visited.
    data = [["" for _ in range(ncols)] for _ in range(nrows)]
    # Fill the grid, duplicating spanned cells into every covered slot.
    for i, row in enumerate(rows):
        cells = row.findAll(["td", "th"])
        for j, cell in enumerate(cells):
            cspan = int(cell.get("colspan", 1))
            # Clamp rowspan so a span declared past the last <tr> does
            # not index beyond the preallocated grid.
            rspan = min(int(cell.get("rowspan", 1)), nrows - i)
            text = cell.text.strip("\n")
            l = 0
            for k in range(rspan):
                # Shift to the first empty slot of this row, so content
                # already inserted by earlier spans is not overwritten.
                while data[i + k][j + l]:
                    l += 1
                for m in range(cspan):
                    data[i + k][j + l + m] += text
    return pd.DataFrame(data)
# Fetch the test page through the MediaWiki API, parse its HTML, and
# extract the second "wikitable" on the page as a DataFrame.
mediawikiapi = MediaWikiAPI()
test_page = mediawikiapi.page(pageid=1569939)
# Uncomment to verify which page was fetched:
# print(test_page.url)
soup = BeautifulSoup(test_page.html(), 'html.parser')
tables = soup.findAll("table", attrs={"class": "wikitable"})
df_test = wikitable_to_dataframe(tables[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment