-
-
Save wassname/5b10774dfcd61cdd3f28 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Scrape a table from wikipedia using python. Allows for cells spanning multiple rows and/or columns. Outputs a
tab-separated (.tsv) file for each table
url: https://gist.github.com/wassname/5b10774dfcd61cdd3f28
authors: panford, wassname, muzzled, Yossi
license: MIT
"""
from bs4 import BeautifulSoup | |
import requests | |
import os | |
import codecs | |
# Page whose "wikitable" tables are scraped.
wiki = "https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart_for_English_dialects"
header = {
    'User-Agent': 'Mozilla/5.0'
}  # Needed to prevent 403 error on Wikipedia
# requests never times out by default; bound the wait so the script cannot hang.
page = requests.get(wiki, headers=header, timeout=30)
# Stop on an HTTP error instead of parsing an error page as if it were the article.
page.raise_for_status()
# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
soup = BeautifulSoup(page.content, "html.parser")
# Every <table class="wikitable"> on the page, in document order.
tables = soup.find_all("table", {"class": "wikitable"})
# Preview: for each table print a numbered banner, the first 100 characters
# of its text and a dotted separator, then close with one full-width rule.
for idx, tbl in enumerate(tables):
    edge = "#" * 10
    title = "Table {}".format(idx)
    print(edge + title + edge)
    preview = tbl.text[:100]
    print(preview)
    print('.' * 80)
print("#" * 80)
def _span(cell, attr):
    """Return the cell's ``rowspan``/``colspan`` attribute as an int >= 1.

    A missing attribute counts as 1. Values that are not integers, or that
    are smaller than 1, are also treated as 1 so that one malformed cell
    cannot abort the export or be dropped from the output.
    """
    try:
        return max(1, int(cell.get(attr, 1)))
    except (TypeError, ValueError):
        return 1


def _table_to_grid(table):
    """Expand one HTML table into a rectangular list of rows of strings.

    A cell spanning several rows and/or columns has its text copied into
    every grid slot it covers. Slots that no cell covers are ``''``.

    Returns a list with one entry per ``<tr>``; every entry has the same
    length (the widest row after spans are expanded).
    """
    rows = table.find_all("tr")
    nrows = len(rows)
    # (row, col) -> text. Key presence marks a slot as taken, so a cell whose
    # text is empty still occupies its slot.
    filled = {}
    for i, row in enumerate(rows):
        col = 0
        for cell in row.find_all(["td", "th"]):
            # Skip slots already claimed by a rowspan from an earlier row.
            while (i, col) in filled:
                col += 1
            cspan = _span(cell, 'colspan')
            # A rowspan may point past the last row; clip it to the table so
            # no slot outside the grid is ever addressed.
            rspan = min(_span(cell, 'rowspan'), nrows - i)
            for k in range(rspan):
                for m in range(cspan):
                    filled[(i + k, col + m)] = cell.text
                    # Echo each placed value, as the script has always done.
                    print(cell.text)
            col += cspan
    # Width comes from the slots actually used, so colspans widen the grid
    # instead of being squeezed into the last column.
    ncols = max([c for _, c in filled] or [-1]) + 1
    return [[filled.get((r, c), '') for c in range(ncols)]
            for r in range(nrows)]


# Last path component of the URL; used to name the output files.
page_name = os.path.split(wiki)[1]

for tn, table in enumerate(tables):
    data = _table_to_grid(table)

    # write data out to tab separated format
    fname = 'output_{}_t{}.tsv'.format(page_name, tn)
    # Explicit UTF-8 so non-ASCII cell text (e.g. IPA symbols) can be written;
    # the with-block closes the file even if a write fails.
    with codecs.open(fname, mode='w', encoding='utf-8') as f:
        for cells in data:
            # Cell text may contain newlines; drop them to keep one line per row.
            rowStr = '\t'.join(cells).replace('\n', '')
            print(rowStr)
            f.write(rowStr + '\n')
When I try to use the code for the link "https://en.wikipedia.org/wiki/Loganair", I get the error below. Can you please help me solve this?
Traceback (most recent call last):
File "", line 65, in
while data[i + k][j + l]:
IndexError: list index out of range
Worked for me, and I notice your line number doesn't correspond to the latest version. Perhaps try the latest code?
I am trying to extract infobox information, so I used class="infobox". For some Wikipedia pages it works, but for this type of company infobox it throws an error.
#37 to #42 could be replaced by : data = [[''] * ncols for i in range(nrows)]
#45: for i in range(nrows):
#47, #68: all instances of rowD should be removed
#73: f = codecs.open(fname, mode='w', encoding='utf-8') <- to deal with non-English characters
Thank you very much for this great gist!!!
Thanks, I added that change.