Skip to content

Instantly share code, notes, and snippets.

@powerswitch
Created June 19, 2014 09:38
Show Gist options
  • Save powerswitch/6ed106ddb50b4e354995 to your computer and use it in GitHub Desktop.
Save powerswitch/6ed106ddb50b4e354995 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import os
import re
from itertools import chain
from urllib.parse import urljoin

from lxml import etree
# Crawl the German Wikipedia "all pages" listing (Spezial:Alle Seiten), collect
# every link marked with the 'mw-redirect' class, and dump the result to
# wpindex.csv as "href,link text" rows.

URL_PREFIX = 'http://de.wikipedia.org/'
# Starting page of the listing, restricted to namespace 0.
URL = URL_PREFIX + 'w/index.php?title=Spezial%3AAlle+Seiten&from=&to=&namespace=0'

# Maps the href of each redirect link to the link's text.
wpentries = dict()
running = True

while URL and running:
    # etree.parse is handed the URL directly, so lxml does the fetching.
    # NOTE(review): this presumably fails if the server redirects to https,
    # which lxml's built-in URL loading may not support -- confirm.
    xml = etree.parse(URL, parser=etree.HTMLParser())

    # The last link of the navigation bar is the page to visit next.
    nav_links = xml.xpath(".//div[@class='mw-allpages-nav']/a")
    next_href = nav_links[-1].get('href') if nav_links else None
    if next_href:
        # urljoin avoids the double slash that plain concatenation produces
        # when the href is root-relative ("/w/index.php?...").
        URL = urljoin(URL_PREFIX, next_href)
        print(URL)
    else:
        # No navigation link: end the crawl instead of raising IndexError.
        URL = None

    for result in xml.xpath(".//table[@class = 'mw-allpages-table-chunk']"):
        for link in result.xpath(".//a[@class = 'mw-redirect']"):
            href = link.get('href')
            if href in wpentries:
                # An href that was already recorded ends the crawl once the
                # current page has been processed; the remaining links on
                # this page are still examined.
                running = False
            else:
                wpentries[href] = link.text

# Open for writing (the default mode is read-only, which made f.write fail).
# newline='' is what the csv module expects; lineterminator='\n' keeps the
# original one-row-per-"\n" layout.
with open('wpindex.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    # csv.writer quotes fields containing commas or quotes and writes a None
    # link text as an empty field, where string concatenation would raise.
    writer.writerows(wpentries.items())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment