@AJRenold
Last active December 16, 2015 09:58
Find a Wikipedia article for a given term via DBpedia lookup. Example call at the bottom.
from bs4 import BeautifulSoup, NavigableString, Tag
from urllib2 import urlopen
import json

def check_dbpedia(term):
    """Query the DBpedia lookup service and return matching resources."""
    api = 'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?MaxHits=10&QueryString='
    #api = 'http://lookup.dbpedia.org/api/search.asmx/PrefixSearch?MaxHits=10&QueryString='
    response = urlopen(api + term)
    soup = BeautifulSoup(response.read())

    # collect label/uri pairs from each <result> element
    urls = []
    for result in soup.findAll('result'):
        for child in result.children:
            if isinstance(child, Tag):
                if child.name == 'label':
                    current_label = child.string
                if child.name == 'uri':
                    urls.append({'label': current_label, 'url': child.string})
    #print urls

    ## exact match
    found = ""
    for url in urls:
        if url['label'] == term:
            url['match'] = 'exact'
            found = url

    ## no exact match: fall back to the first three results
    if found == "":
        found = urls[0:3]
        for url in found:
            url['match'] = 'partial'

    return found


def wiki_url(url):
    """Resolve a DBpedia resource URL to its Wikipedia article URL."""
    # the resource name is the text after the last slash
    term = url[url.rfind('/') + 1:]
    entity_page = 'http://dbpedia.org/data/{}.json'.format(term)
    #print(entity_page)
    wiki_type = 'http://xmlns.com/foaf/0.1/primaryTopic'
    response = urlopen(entity_page)
    data = json.loads(response.read())

    # the key whose value carries foaf:primaryTopic is the Wikipedia page URL
    for key, value in data.items():
        #print("key", value)
        if wiki_type in value:
            return key


def get_wiki_url(term):
    """Return a dict with 'match' ('exact', 'partial' or 'none') and, when found, 'urls'."""
    results = check_dbpedia(term)
    #print results

    wikis = {}
    if type(results) == dict:
        wiki = wiki_url(results['url'])
        wikis['urls'] = [wiki]
        wikis['match'] = 'exact'
    elif len(results) == 0:
        wikis['match'] = 'none'
    else:
        wikis['match'] = 'partial'
        wikis['urls'] = []
        for result in results:
            wiki = wiki_url(result['url'])
            wikis['urls'].append(wiki)

    return wikis


urls = get_wiki_url('HTTP')
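
The call above returns a small dict rather than a bare URL. A minimal usage sketch follows; the printed values are illustrative and depend on the live DBpedia lookup service:

# Minimal usage sketch; actual URLs depend on the live DBpedia lookup service.
result = get_wiki_url('HTTP')
print(result['match'])            # 'exact', 'partial', or 'none'
if result['match'] != 'none':
    for u in result['urls']:
        print(u)                  # resolved Wikipedia article URL(s)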
@iaperez
iaperez commented Apr 18, 2013

import mwparserfromhell
from wikitools import wiki
from wikitools import category

wikisite = "http://en.wikipedia.org/w/api.php"
parse_category = "USA"

wikiObject = wiki.Wiki(wikisite)
wikiCategory = category.Category(wikiObject, parse_category)
articles = wikiCategory.getAllMembersGen(namespaces=[0])

for article in articles:
    wikiraw = article.getWikiText()
    wikiraw = wikiraw.decode('UTF-8')
    parsedWikiText = mwparserfromhell.parse(wikiraw)
    indexOfIb = parsedWikiText.find('Infobox')
    if indexOfIb >= 0:
        print indexOfIb
        #for x in parsedWikiText.nodes:
        #    print x

        print parsedWikiText.filter_templates(matches='Infobox')
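
A side note on the mwparserfromhell API used above: templates returned by filter_templates expose has() and get() for their parameters, so individual infobox fields can be read without scanning nodes by hand. A minimal sketch, assuming a reasonably recent mwparserfromhell and that wikiraw holds an article's wikitext as in the loop above:

# Minimal sketch: read one field from the first Infobox template in `wikiraw`.
# Assumes a recent mwparserfromhell; the parameter name is just an example.
import mwparserfromhell

parsed = mwparserfromhell.parse(wikiraw)
infoboxes = parsed.filter_templates(matches='Infobox')
if infoboxes:
    box = infoboxes[0]
    if box.has('conventional_long_name'):
        # get() returns a Parameter; .value is the field's wikitext
        print(box.get('conventional_long_name').value.strip_code().strip())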

@iaperez
iaperez commented Apr 25, 2013


import pandas as pd
import mwparserfromhell
from wikitools import wiki
from wikitools import api
from wikitools import category
wikisite = "http://en.wikipedia.org/w/api.php"
parse_category = "United States"

wikiObject = wiki.Wiki(wikisite) 
wikiCategory = category.Category(wikiObject, parse_category)
articles = wikiCategory.getAllMembersGen(namespaces=[0])

#this is not an efficient solution, but it is clean...
#only works with previously organized info from wikipedia (templates)
for article in articles:
    wikiraw = article.getWikiText()
    wikiraw = wikiraw.decode('UTF-8')
    parsedWikiText = mwparserfromhell.parse(wikiraw)
    for x in parsedWikiText.nodes:
        if "template" in str(type(x)):
            if "Infobox" in str(x.name):
                print x.get('conventional_long_name')
                print x.get('area_km2')
                print x.get('GDP_nominal_per_capita')
                print x.get('HDI')
                print x.get('Gini')
                print x.get('population_density_km2')
                print x.get('latd')
                print x.get('latm')
                print x.get('latNS')
                print x.get('longd')
                print x.get('longm')
                print x.get('longEW')
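
pandas is imported above but never used; a natural follow-up is to collect the same infobox fields into a DataFrame instead of printing them. A minimal sketch under the same assumptions (wikitools category iteration, a recent mwparserfromhell with has()/get(); the field names simply mirror the prints above):

# Minimal sketch: gather selected Infobox parameters into a pandas DataFrame.
# Missing parameters become None; field names mirror the prints above.
fields = ['conventional_long_name', 'area_km2', 'GDP_nominal_per_capita',
          'HDI', 'Gini', 'population_density_km2']

rows = []
for article in wikiCategory.getAllMembersGen(namespaces=[0]):
    parsed = mwparserfromhell.parse(article.getWikiText().decode('UTF-8'))
    for tpl in parsed.filter_templates(matches='Infobox'):
        rows.append({f: (tpl.get(f).value.strip_code().strip() if tpl.has(f) else None)
                     for f in fields})

df = pd.DataFrame(rows, columns=fields)
print(df.head())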
