Skip to content

Instantly share code, notes, and snippets.

@michaeljoseph
Last active December 24, 2015 19:39
Show Gist options
  • Save michaeljoseph/6851909 to your computer and use it in GitHub Desktop.
Save michaeljoseph/6851909 to your computer and use it in GitHub Desktop.
wikistates using scraperwiki
#!/usr/bin/env python
from pprint import pprint
from pyquery import PyQuery as pq
import requests
import scraperwiki
# SQL statements used to reset the `sovereign_states` table, keyed by action.
DDL = {
    # Creates the table with three text columns if it is not already present.
    'create': (
        '\n'
        'CREATE TABLE IF NOT EXISTS `sovereign_states` (\n'
        '`href` text,\n'
        '`absolute_url` text,\n'
        '`title` text\n'
        ');\n'
    ),
    # Removes the table if it is present.
    'drop': 'DROP TABLE IF EXISTS `sovereign_states`;',
}
# Scrape the Wikipedia "List of sovereign states" page and store one row per
# matched link (title, relative href, absolute URL) in `sovereign_states`.
wikipedia = 'http://en.wikipedia.org'
states_url = '%s/wiki/List_of_sovereign_states' % wikipedia
# Anchors that sit inside a <b> directly within a table cell; presumably these
# are the state-name links -- verify against the current page markup.
expression = 'table > tr > td > b > a'

# Fetch and parse BEFORE touching the database, so a network or HTTP failure
# cannot leave the table dropped with nothing to replace it.
# The timeout keeps a stalled connection from hanging the script forever, and
# raise_for_status() stops us from parsing an error page as if it were data.
response = requests.get(states_url, timeout=30)
response.raise_for_status()
html_content = response.content
document = pq(html_content)

state_data = []
for link_element in document(expression):
    attrs = link_element.attrib
    country = {
        'title': attrs['title'],
        'href': attrs['href'],
        'absolute_url': '%s%s' % (wikipedia, attrs['href']),
    }
    state_data.append(country)
pprint(state_data)

# Rebuild the table from scratch, then save the rows keyed on `title`.
scraperwiki.sql.execute(DDL['drop'])
scraperwiki.sql.execute(DDL['create'])
scraperwiki.sql.save(['title'], state_data, table_name='sovereign_states')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment