-
-
Save kinsleykajiva/2303e00022126982d068656aa71721d4 to your computer and use it in GitHub Desktop.
Scrape a table from wikipedia using python. Allows for cells spanning multiple rows and/or columns. Outputs csv files for each table
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Scrape every "wikitable" table from a Wikipedia page and write each one
out as a CSV file. Cells spanning multiple rows and/or columns are
expanded so every covered grid position receives the cell's text.
"""
import csv
import os
import urllib.request

from bs4 import BeautifulSoup

WIKI = "https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart_for_English_dialects"
# Wikipedia answers 403 to the default Python User-Agent.
HEADERS = {'User-Agent': 'Mozilla/5.0'}


def _parse_table(table):
    """Expand a bs4 <table> element into a rectangular list of rows of cell text.

    A cell with rowspan/colspan is copied into every grid position it
    covers. Positions already claimed by a span from an earlier row are
    skipped, so subsequent cells land in the correct column (the original
    index-based placement mis-aligned rows that follow a rowspan cell).
    """
    rows = table.find_all("tr")
    if not rows:
        return []
    ncols = max(len(r.find_all(["td", "th"])) for r in rows)
    # None marks "not yet occupied" so genuinely empty cells ("") can be
    # distinguished from positions still available for placement.
    grid = [[None] * ncols for _ in rows]
    for i, row in enumerate(rows):
        j = 0  # current grid column for this row
        for cell in row.find_all(["td", "th"]):
            # Skip columns occupied by a rowspan cell from a previous row.
            while j < ncols and grid[i][j] is not None:
                j += 1
            cspan = int(cell.get("colspan", 1))
            rspan = int(cell.get("rowspan", 1))
            text = cell.get_text()
            for di in range(rspan):
                for dj in range(cspan):
                    # Guard indices: a declared span may run past the
                    # grid bounds computed from raw cell counts.
                    if i + di < len(grid) and j + dj < ncols:
                        grid[i + di][j + dj] = text
            j += cspan
    # CSV output wants strings, not the None sentinel.
    return [["" if c is None else c for c in r] for r in grid]


def main():
    """Fetch the page, preview each wikitable, and dump one CSV per table."""
    req = urllib.request.Request(WIKI, headers=HEADERS)
    with urllib.request.urlopen(req) as resp:
        soup = BeautifulSoup(resp, "html.parser")
    tables = soup.find_all("table", {"class": "wikitable"})

    # Show a short preview of each table found.
    for table in tables:
        print("###############")
        print(table.get_text()[:100])

    page = os.path.split(WIKI)[1]
    for tn, table in enumerate(tables):
        grid = _parse_table(table)
        fname = 'output_{}_t{}.csv'.format(page, tn)
        # csv.writer handles quoting, so cell text containing commas or
        # quotes no longer corrupts the output rows.
        with open(fname, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for row in grid:
                cleaned = [c.replace('\n', '') for c in row]
                print(','.join(cleaned))
                writer.writerow(cleaned)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thx for code! Helped me to get inspired :)
However, I noticed that the code does not account for row spans: when, for example, a cell in the first row has rowspan=2, the following row contains one fewer cell, because the spanning cell from the previous row already occupies one of its columns.
Hence, this column with the occupied cell has to be skipped.
Here some code to give an idea of what I mean:
col_offset = 0
while data[i][j + col_offset] != "":
col_offset += 1 # increase offset in case cell is already occupied == not empty
data[i][j + col_offset] = cell_text
Adding this to the code solved the problem for me.
P.S.: Somehow adding code into a comment is really ugly. I hope you can nevertheless understand my idea.