Procedure to scrape a table from Wikipedia using Python. Uses MediaWikiAPI to get the page content. Allows for cells spanning multiple rows and/or columns. Outputs a pandas DataFrame.
# -*- coding: utf-8 -*-
"""
Procedure to scrape a table from Wikipedia using Python. Uses MediaWikiAPI to get the page content.
Allows for cells spanning multiple rows and/or columns. Outputs a pandas DataFrame.
Page used for testing (the second table in particular):
https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart_for_English_dialects
"""
import pandas as pd | |
from bs4 import BeautifulSoup | |
from mediawikiapi import MediaWikiAPI | |
def wikitable_to_dataframe(table):
    """
    Converts a Wikipedia table parsed by BeautifulSoup into a DataFrame.
    Deals with spanning: multirow and multicolumn cells are expanded as expected.
    """
    rows = table.find_all("tr")
    nrows = len(rows)
    # Count columns from the sum of colspans, so that header cells spanning
    # several columns do not lead to an undersized grid.
    ncols = max(
        sum(int(c.get('colspan', 1)) for c in r.find_all(['th', 'td']))
        for r in rows
    )
    # Preallocate the table structure
    # (this is required because we need to move forward in the table
    # structure once we've found a row span)
    data = [['' for _ in range(ncols)] for _ in range(nrows)]
    # Fill the table with data:
    # move across the cells and use the spans to fill the extra positions
    for i, row in enumerate(rows):
        cells = row.find_all(["td", "th"])
        for j, cell in enumerate(cells):
            cspan = int(cell.get('colspan', 1))
            rspan = int(cell.get('rowspan', 1))
            l = 0
            for k in range(rspan):
                # Shift to the first empty cell of this row
                # to avoid replacing previously inserted content
                while data[i + k][j + l]:
                    l += 1
                for m in range(cspan):
                    data[i + k][j + l + m] += cell.text.strip("\n")
    return pd.DataFrame(data)
mediawikiapi = MediaWikiAPI() | |
test_page = mediawikiapi.page(pageid=1569939) | |
# to check page URL: | |
# print(test_page.url) | |
soup = BeautifulSoup(test_page.html(), 'html.parser') | |
tables = soup.find_all("table", {"class": "wikitable"})
df_test = wikitable_to_dataframe(tables[1]) | |
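
As a quick sanity check of the span handling (this snippet is not part of the original gist; the inline HTML table and the demo_* names are made up for illustration), the same function can be run on a small table that contains both a colspan and a rowspan cell:

# Minimal sketch, assuming the imports and wikitable_to_dataframe above are already defined.
demo_html = """
<table class="wikitable">
  <tr><th>A</th><th colspan="2">B</th></tr>
  <tr><td rowspan="2">x</td><td>1</td><td>2</td></tr>
  <tr><td>3</td><td>4</td></tr>
</table>
"""
demo_table = BeautifulSoup(demo_html, 'html.parser').find("table")
print(wikitable_to_dataframe(demo_table))
# Expected output: 'B' is repeated across the two columns it spans
# and 'x' down the two rows it spans:
#    0  1  2
# 0  A  B  B
# 1  x  1  2
# 2  x  3  4

For the real page, df_test can be inspected with df_test.head() or written out with df_test.to_csv(); note that the wikitable's header rows come through as ordinary data rows, so column names have to be assigned afterwards if needed.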