Skip to content

Instantly share code, notes, and snippets.

@vinipsmaker
Created July 1, 2016 12:19
Show Gist options
  • Save vinipsmaker/903cbf27eecf48a27e243a1b8aae068d to your computer and use it in GitHub Desktop.
Save vinipsmaker/903cbf27eecf48a27e243a1b8aae068d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from io import StringIO
import requests
from lxml import etree
from urllib.parse import urlparse, urlunparse, urlencode, parse_qsl
# Host of the DokuWiki instance being mirrored.
BASE_HOST = 'rpg-design-patterns.speedykitty.com'
BASE_URL = 'http://' + BASE_HOST

# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 30

# XPath prefix shared by all three queries: the top-level <ul class="idx">
# inside the element with id "index__tree" on the wiki index page.
INDEX_LIST_XPATH = ('//html'
                    '/body'
                    '/div[@class = "dokuwiki"]'
                    '/div[@class = "section"]'
                    '/div[@class = "article"]'
                    '/div[@id = "content-area"]'
                    '/div'
                    '/div[@id = "index__tree"]'
                    '/ul[@class = "idx"]')

# hrefs of the pages listed directly at the first level of the index.
TOP_LEVEL_PAGES_XPATH = (INDEX_LIST_XPATH
                         + '/li[@class = "level1"]'
                         + '/div[@class = "li"]'
                         + '/a[@class = "wikilink1"]'
                         + '/@href')

# hrefs of the collapsed ("closed") index entries; each one is fetched to
# obtain the index page with that entry expanded.
CLOSED_DIRS_XPATH = (INDEX_LIST_XPATH
                     + '/li[@class = "closed"]'
                     + '/div[@class = "li"]'
                     + '/a[@class = "idx_dir"]'
                     + '/@href')

# hrefs of the second-level pages shown under an expanded ("open") entry.
NESTED_PAGES_XPATH = (INDEX_LIST_XPATH
                      + '/li[@class = "open"]'
                      + '/ul[@class = "idx"]'
                      + '/li[@class = "level2"]'
                      + '/div[@class = "li"]'
                      + '/a[@class = "wikilink1"]'
                      + '/@href')


def fetch_html(url):
    """Download *url* and return the root element of the parsed HTML.

    The raw response bytes are handed to lxml directly. Wrapping them in
    ``str()`` would parse the ``b'...'`` repr of the bytes object (with
    escaped newlines and non-ASCII bytes) instead of the actual document.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, so an error page is never mistaken for an index page.
    """
    response = requests.request('GET', url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return etree.fromstring(response.content, etree.HTMLParser())


def collect_page_links():
    """Return the hrefs of every page reachable from the wiki index.

    First-level pages are read straight from the index page. Every closed
    index entry is then fetched once and the second-level pages shown under
    its expanded entry are appended.
    """
    index = fetch_html(BASE_URL + '/doku.php/start?do=index')
    links = list(index.xpath(TOP_LEVEL_PAGES_XPATH))
    for dir_href in index.xpath(CLOSED_DIRS_XPATH):
        expanded_index = fetch_html(BASE_URL + str(dir_href))
        links.extend(expanded_index.xpath(NESTED_PAGES_XPATH))
    return links


def export_target(href):
    """Map a page href to ``(export_url, filename)``.

    The export URL points at BASE_HOST, keeps the href's path and existing
    query parameters, and appends ``do=export_raw``. The filename is the
    last ``/``-separated segment of the href's path.
    """
    parsed = urlparse(href, 'http')
    query = urlencode(parse_qsl(parsed.query) + [('do', 'export_raw')])
    export_url = urlunparse(
        (parsed.scheme, BASE_HOST, parsed.path, '', query, ''))
    return export_url, parsed.path.split('/')[-1]


def main():
    """Download every indexed wiki page into the current directory."""
    for href in collect_page_links():
        export_url, filename = export_target(href)
        # A path ending in '/' yields an empty name, and '.'/'..' are
        # directories; none of them can be opened as an output file.
        if filename in ('', '.', '..'):
            continue
        response = requests.request('GET', export_url,
                                    timeout=REQUEST_TIMEOUT)
        # Fail loudly rather than saving an HTTP error body as page text.
        response.raise_for_status()
        with open(filename, 'wb') as out:
            out.write(response.content)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment