Created
December 26, 2011 22:41
-
-
Save gka/1522240 to your computer and use it in GitHub Desktop.
Prototype of a scraper that collects German party members from Wikipedia.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
from wikitools import wiki | |
from wikitools import api | |
# Shared handle for the German-language Wikipedia API; every request in this
# file goes through it. The endpoint uses HTTPS because Wikimedia serves its
# wikis over HTTPS only: a plain-HTTP URL is answered with a redirect, which
# is not a reliable path for API requests.
site = wiki.Wiki('https://de.wikipedia.org/w/api.php')
#
# Returns a list of the categories that the given page(s) belong to.
#
def getPageCategories(pages):
    """Return the categories that the given page(s) are filed under.

    pages -- a single page or a list/tuple of pages. Each page is either a
             title string or a dict carrying a 'title' key (the shape of the
             entries that categoryMembers() returns).

    Returns a list with one entry per category found in the API response,
    or an empty list when there is nothing to look up or nothing was found.
    """
    if not pages:
        return []
    if not isinstance(pages, (list, tuple)):
        pages = [pages]

    # The API takes several titles in one request, separated by '|'.
    titles = '|'.join(
        page['title'] if isinstance(page, dict) else page for page in pages
    )

    # When a list module is used as a generator, its parameters take a 'g'
    # prefix, hence 'gcllimit' rather than 'cllimit'.
    params = {'generator': 'categories', 'gcllimit': 500, 'titles': titles}
    result = query(params)

    # NOTE(review): assumes generator output is delivered as a mapping under
    # result['query']['pages'] (keyed by page id) -- confirm against a live
    # response. A response without a 'query' part is treated as "no results".
    found = result.get('query', {}).get('pages', {})
    return list(found.values())
#
# Returns the pages and the subcategories that belong to this category.
#
def categoryMembers(cat):
    """Fetch the members of one category and split them by namespace.

    cat -- category title, sent to the API as ``cmtitle``.

    Returns a ``(pages, subcat)`` tuple of two lists holding the raw member
    entries from the response: ``pages`` gets the entries whose ``ns`` is 0,
    ``subcat`` those whose ``ns`` is 14. Entries in any other namespace are
    dropped.
    """
    response = query(
        {'list': 'categorymembers', 'cmlimit': 500, 'cmtitle': cat}
    )
    members = response['query']['categorymembers']

    # Namespace 0 holds articles, namespace 14 holds categories.
    pages = [member for member in members if member['ns'] == 0]
    subcat = [member for member in members if member['ns'] == 14]
    return pages, subcat
#
# Performs a simple API query (action=query) and returns the result.
#
def query(params):
    """Send ``params`` to the wiki as an ``action=query`` request.

    params -- dict of API parameters. It is modified in place: its 'action'
              entry is set to 'query' before the request is built.

    Returns whatever the request's ``query()`` call returns.
    """
    params.update(action='query')
    return api.APIRequest(site, params).query()
# Define the parties: party key -> list of root category titles to crawl.
# Entries that are commented out are currently excluded from the crawl.
# ``p`` is bound to the same dict as ``parties``.
parties = p = {
    #'CDU': ['Kategorie:CDU-Mitglied'],
    #'SPD': ['Kategorie:SPD-Mitglied'],
    'GRUENE': ['Kategorie:Bündnis-90/Die-Grünen-Mitglied'],
    #'CSU': ['Kategorie:CSU-Mitglied'],
    #'LINKE': ['Kategorie:Die-Linke-Mitglied'],
    #'PDS': ['Kategorie:PDS-Mitglied'],
    'FDP': ['Kategorie:FDP-Mitglied'],
}
# Planned processing steps:
# 1 collect all articles in the respective categories (list=categorymembers)
# 2 for each article, look up the other categories it is filed under
#   (generator=categories)
# 3 for each article, store the backlinks (which articles link to it)
#   list=backlinks
#     backlinks: politician > politician (direct connection, one-way)
#     backlinks: politician <> politician (direct connection, mutual)
#     backlinks: politician < non-politician > politician (indirect connection)
#     backlinks: politician < non-politician < politician (weak indirect
#     connection)
# 4 examine the outgoing links of an article  prop=links&titles=Main Page
#
# Only step 1 is implemented below: for every party, walk its categories
# (including subcategories) and print the titles of the member articles.
for party, categories in parties.items():
    # ``categories`` doubles as the work queue: subcategories are appended
    # while the loop runs, and the for-loop picks them up. ``visited`` makes
    # sure each category is queued once, so a subcategory reachable by several
    # paths (or a cycle in the category graph) cannot cause repeated or
    # endless crawling.
    visited = set(categories)
    for cat in categories:
        # get members of this category
        members, subcats = categoryMembers(cat)

        # cat[10:] strips the 10-character 'Kategorie:' prefix. Each print is
        # given one pre-built string so the output is the same whether print
        # is a statement (Python 2) or a function (Python 3).
        print('\n' + cat[10:] + ' :\n')

        for sub in subcats:
            sub_title = sub['title']
            if sub_title not in visited:
                visited.add(sub_title)
                categories.append(sub_title)

        # Loop variable is ``page`` rather than ``p`` so that the module-level
        # ``p`` is not overwritten.
        for page in members:
            print('  ' + page['title'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment