Find the Wikipedia article for a given term via DBpedia. An example call is at the bottom.
from bs4 import BeautifulSoup, NavigableString, Tag
from urllib2 import urlopen
import json

def check_dbpedia(term):
    """Query the DBpedia lookup service and return matching resource URIs."""
    api = 'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?MaxHits=10&QueryString='
    #api = 'http://lookup.dbpedia.org/api/search.asmx/PrefixSearch?MaxHits=10&QueryString='
    response = urlopen(api + term)
    soup = BeautifulSoup(response.read())

    # Collect (label, uri) pairs from the XML results
    urls = []
    for result in soup.findAll('result'):
        for child in result.children:
            if isinstance(child, Tag):
                if child.name == 'label':
                    current_label = child.string
                if child.name == 'uri':
                    urls.append({'label': current_label, 'url': child.string})
    #print urls

    ## exact match: return the single result whose label equals the term
    found = ""
    for url in urls:
        if url['label'] == term:
            url['match'] = 'exact'
            found = url

    ## no exact match: return up to three partial matches
    if found == "":
        found = urls[0:3]
        for url in found:
            url['match'] = 'partial'

    return found
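One caveat not addressed in the gist: the term is appended to the query string unescaped, so a multi-word term would need URL-encoding first. A minimal sketch using urllib.quote (a hypothetical tweak, not part of the original code):

from urllib import quote

# Percent-encode the term before it is appended to the query string,
# so searches like 'New York' do not break the request URL.
hits = check_dbpedia(quote('New York'))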
def wiki_url(url):
    """Resolve a DBpedia resource URI to its Wikipedia page URL."""
    term = url[url.rfind('/') + 1:]  # resource name after the last slash
    entity_page = 'http://dbpedia.org/data/{}.json'.format(term)
    #print(entity_page)
    wiki_type = 'http://xmlns.com/foaf/0.1/primaryTopic'
    response = urlopen(entity_page)
    data = json.loads(response.read())
    # The key whose value carries foaf:primaryTopic is the Wikipedia page URL
    for key, value in data.items():
        #print("key", value)
        if wiki_type in value:
            return key
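A usage sketch for wiki_url; the resource URI below is only illustrative, in practice it comes from check_dbpedia:

# Prints the Wikipedia page URL carrying foaf:primaryTopic, or None if the
# entity JSON has no such entry.
print wiki_url('http://dbpedia.org/resource/Hypertext_Transfer_Protocol')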
def get_wiki_url(term):
    """Return Wikipedia URLs for a search term, tagged with the match quality."""
    results = check_dbpedia(term)
    #print results
    wikis = {}
    if type(results) == dict:
        # Single exact match
        wiki = wiki_url(results['url'])
        wikis['urls'] = [wiki]
        wikis['match'] = 'exact'
    elif len(results) == 0:
        # Nothing found
        wikis['match'] = 'none'
    else:
        # Up to three partial matches
        wikis['match'] = 'partial'
        wikis['urls'] = []
        for result in results:
            wiki = wiki_url(result['url'])
            wikis['urls'].append(wiki)
    return wikis

urls = get_wiki_url('HTTP')
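To see what comes back, a quick inspection of the result; the 'match' and 'urls' keys are exactly the ones built by get_wiki_url above ('urls' is absent when nothing was found):

print urls['match']
for u in urls.get('urls', []):
    print u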
A second snippet pulls country Infobox fields from the articles in a Wikipedia category, using wikitools to fetch the wikitext and mwparserfromhell to parse it.

import pandas as pd
import mwparserfromhell
from wikitools import wiki
from wikitools import api
from wikitools import category
wikisite = "http://en.wikipedia.org/w/api.php"
parse_category = "United States"
wikiObject = wiki.Wiki(wikisite)
wikiCategory = category.Category(wikiObject, parse_category)
articles = wikiCategory.getAllMembersGen(namespaces=[0])

# This is not an efficient solution, but it is clean...
# It only works with information Wikipedia has already organized into templates.
for article in articles:
    wikiraw = article.getWikiText()
    wikiraw = wikiraw.decode('UTF-8')
    parsedWikiText = mwparserfromhell.parse(wikiraw)
    # Look for the country Infobox template and print selected parameters
    for x in parsedWikiText.nodes:
        if "template" in str(type(x)):
            if "Infobox" in str(x.name):
                print x.get('conventional_long_name')
                print x.get('area_km2')
                print x.get('GDP_nominal_per_capita')
                print x.get('HDI')
                print x.get('Gini')
                print x.get('population_density_km2')
                print x.get('latd')
                print x.get('latm')
                print x.get('latNS')
                print x.get('longd')
                print x.get('longm')
                print x.get('longEW')
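pandas is imported at the top but never used; here is one hedged sketch of how the same fields could be collected into a DataFrame instead of printed. The field list and the infobox_rows helper are assumptions, not part of the original gist:

fields = ['conventional_long_name', 'area_km2', 'GDP_nominal_per_capita',
          'HDI', 'Gini', 'population_density_km2']

def infobox_rows(pages):
    # Yield one dict of infobox parameter values per article
    for page in pages:
        parsed = mwparserfromhell.parse(page.getWikiText().decode('UTF-8'))
        for template in parsed.filter_templates():
            if 'Infobox' in str(template.name):
                yield dict((f, str(template.get(f).value).strip())
                           for f in fields if template.has(f))

df = pd.DataFrame(list(infobox_rows(wikiCategory.getAllMembersGen(namespaces=[0]))))
print df.head()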
A variant from the gist comments: locate the Infobox with find() and list the matching templates with filter_templates().

wikisite = "http://en.wikipedia.org/w/api.php"
parse_category = "USA"
wikiObject = wiki.Wiki(wikisite)
wikiCategory = category.Category(wikiObject, parse_category)
articles = wikiCategory.getAllMembersGen(namespaces=[0])
for article in articles:
    wikiraw = article.getWikiText()
    wikiraw = wikiraw.decode('UTF-8')
    parsedWikiText = mwparserfromhell.parse(wikiraw)
    # find() returns -1 when the page has no Infobox
    indexOfIb = parsedWikiText.find('Infobox')
    if indexOfIb >= 0:
        print indexOfIb
        #for x in parsedWikiText.nodes:
        #    print x
        print parsedWikiText.filter_templates(matches='Infobox')
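A slightly more direct variant under the same assumptions, using filter_templates with the matches keyword and has()/get() to read a parameter only when it exists; the parameter name is the one used in the gist above:

for article in wikiCategory.getAllMembersGen(namespaces=[0]):
    parsed = mwparserfromhell.parse(article.getWikiText().decode('UTF-8'))
    for infobox in parsed.filter_templates(matches='Infobox'):
        # get() raises if the parameter is missing, so check with has() first
        if infobox.has('conventional_long_name'):
            print infobox.get('conventional_long_name').value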