aculich · March 21, 2024 04:51 · kleinias · Dec 3, 2017 · luk-f-a · May 27, 2018
diff --git a/wikipedia-infoboxes-in-pandas.ipynb b/wikipedia-infoboxes-in-pandas.ipynb
diff --git a/wikipedia-infoboxes-in-pandas.py b/wikipedia-infoboxes-in-pandas.py
 # -*- coding: utf-8 -*-
 # <nbformat>3.0</nbformat>

 # <codecell>

 from pandas.io.html import read_html
 # Wikipedia article whose HTML tables are extracted below.
 page = 'https://en.wikipedia.org/wiki/University_of_California,_Berkeley'
 # Select tables by their HTML class attribute. index_col=0 uses the first
 # column as row labels; infer_types=False turns off automatic type
 # inference (needed on pandas <0.14).
 # NOTE(review): newer pandas releases may reject infer_types -- confirm the
 # pandas version this notebook targets.
 infoboxes = read_html(page, index_col=0, infer_types=False, attrs={"class":"infobox"})
 wikitables = read_html(page, index_col=0, infer_types=False, attrs={"class":"wikitable"})

 # Parenthesised single-argument print works on both Python 2 and Python 3;
 # the bare print statement is a SyntaxError on Python 3.
 print("Extracted {num} infoboxes".format(num=len(infoboxes)))
 print("Extracted {num} wikitables".format(num=len(wikitables)))

 # <codecell>

 # First table matched by the "infobox" class. A bare expression,
 # presumably so the notebook renders it as the cell's output.
 infoboxes[0]

 # <codecell>

 # Second table matched by the "infobox" class.
 infoboxes[1]

 # <codecell>

 # First table matched by the "wikitable" class.
 wikitables[0]

 # <codecell>

 # Second table matched by the "wikitable" class.
 wikitables[1]

 # <markdowncell>

 # The `infer_types=False` argument is needed to turn off automatic type inference for Pandas <0.14. Otherwise, if date-like text appears in the table, the data type will automatically be inferred as a date for the whole column, not just the particular entry, resulting in a table full of `NaT`s for non-date entries. In version >=0.14 the `infer_types` argument will be removed, so it will no longer cause this kind of problem.

 # <codecell>

 # Same infobox query, but without infer_types=False, so pandas' default
 # type inference stays in effect; then show the first resulting table.
 malformed = read_html(
     page,
     index_col=0,
     attrs={"class": "infobox"},
 )
 malformed[0]

 # <markdowncell>

 # The `index_col=0` argument will turn the zeroth-column into a set of labels for the table rows which is what we want for infoboxes and some (but not all) wikitables.
 # 
 # Using `index_col` means that we can then refer directly to the entries by their label, e.g.:

 # <codecell>

 # Select the row labelled 'Motto' in the first infobox and take its first value.
 (
     infoboxes[0]
     .xs(u'Motto')
     .values[0]
 )

 # <markdowncell>

 # If the `index_col` argument is left out, the labels will instead be a numeric index and the zeroth column will be part of the data.

 # <codecell>

 # Repeat the infobox query without index_col, so the first column is not
 # used as row labels; then show the first resulting table.
 no_lefthand_labels = read_html(
     page,
     infer_types=False,
     attrs={"class": "infobox"},
 )
 no_lefthand_labels[0]
	# -*- coding: utf-8 -*-
	# <nbformat>3.0</nbformat>

	# <codecell>

	from pandas.io.html import read_html
	# Wikipedia article whose HTML tables are extracted below.
	page = 'https://en.wikipedia.org/wiki/University_of_California,_Berkeley'
	# Select tables by their HTML class attribute. index_col=0 uses the first
	# column as row labels; infer_types=False turns off automatic type
	# inference (needed on pandas <0.14).
	# NOTE(review): newer pandas releases may reject infer_types -- confirm the
	# pandas version this notebook targets.
	infoboxes = read_html(page, index_col=0, infer_types=False, attrs={"class":"infobox"})
	wikitables = read_html(page, index_col=0, infer_types=False, attrs={"class":"wikitable"})

	# Parenthesised single-argument print works on both Python 2 and Python 3;
	# the bare print statement is a SyntaxError on Python 3.
	print("Extracted {num} infoboxes".format(num=len(infoboxes)))
	print("Extracted {num} wikitables".format(num=len(wikitables)))

	# <codecell>

	# First table matched by the "infobox" class. A bare expression,
	# presumably so the notebook renders it as the cell's output.
	infoboxes[0]

	# <codecell>

	# Second table matched by the "infobox" class.
	infoboxes[1]

	# <codecell>

	# First table matched by the "wikitable" class.
	wikitables[0]

	# <codecell>

	# Second table matched by the "wikitable" class.
	wikitables[1]

	# <markdowncell>

	# The `infer_types=False` argument is needed to turn off automatic type inference for Pandas <0.14. Otherwise, if date-like text appears in the table, the data type will automatically be inferred as a date for the whole column, not just the particular entry, resulting in a table full of `NaT`s for non-date entries. In version >=0.14 the `infer_types` argument will be removed, so it will no longer cause this kind of problem.

	# <codecell>

	# Same infobox query, but without infer_types=False, so pandas' default
	# type inference stays in effect; then show the first resulting table.
	malformed = read_html(
		page,
		index_col=0,
		attrs={"class": "infobox"},
	)
	malformed[0]

	# <markdowncell>

	# The `index_col=0` argument will turn the zeroth-column into a set of labels for the table rows which is what we want for infoboxes and some (but not all) wikitables.
	#
	# Using `index_col` means that we can then refer directly to the entries by their label, e.g.:

	# <codecell>

	# Select the row labelled 'Motto' in the first infobox and take its first value.
	(
		infoboxes[0]
		.xs(u'Motto')
		.values[0]
	)

	# <markdowncell>

	# If the `index_col` argument is left out, the labels will instead be a numeric index and the zeroth column will be part of the data.

	# <codecell>

	# Repeat the infobox query without index_col, so the first column is not
	# used as row labels; then show the first resulting table.
	no_lefthand_labels = read_html(
		page,
		infer_types=False,
		attrs={"class": "infobox"},
	)
	no_lefthand_labels[0]