Skip to content

Instantly share code, notes, and snippets.

@whiteclover
Created July 1, 2013 00:09
Show Gist options
  • Save whiteclover/5897586 to your computer and use it in GitHub Desktop.
Save whiteclover/5897586 to your computer and use it in GitHub Desktop.
TouHou Project Artist urls Fetch xml parser demo
#!/usr/bin/python
# -*- coding: utf-8 -*-
# This script fetches the URLs of the Touhou Project artists, in preparation for fetching their albums and songs.
import urllib2
import HTMLParser
from urlparse import urljoin
# href values that the link parser must ignore because they are not artist
# entries.  NOTE(review): these look like the server's feedback link, the
# directory-listing sort links, the parent directory and a few helper scripts
# stored next to the artist folders -- confirm against the live listing.
FILTERURLS = [
    "/cgi-bin/feedback",
    "?C=N;O=D",
    "?C=M;O=A",
    "?C=S;O=A",
    "?C=D;O=A",
    "/",
    "cp_images.sh",
    "fix_unicode.sh",
    "mv_images.sh",
    "ren13.py",
]

# Page that is downloaded and parsed; every href found on it is resolved
# against this URL.
BASEURL = "http://kuukunen.net:8080/toh/"
class ArtistLinksParser(HTMLParser.HTMLParser):
    """Collect ``[artist, url]`` pairs from the ``<a>`` tags of an HTML page.

    Feed the page with ``feed()``, call ``close()``, then read the result from
    ``ArtistUrls`` or dump it with ``write2file()``.

    Attributes:
        recording: nesting depth of ``<a>`` tags currently being recorded
            (0 means "not inside an accepted link").
        url: absolute URL of the link currently (or most recently) recorded.
        ArtistUrls: list of ``[artist, url]`` entries, in document order.
        base_url: URL that relative hrefs are resolved against.
        filter_urls: href values that are skipped.
    """

    def __init__(self, base_url=None, filter_urls=None):
        """Create a parser.

        Args:
            base_url: URL used to resolve relative hrefs; defaults to the
                module-level ``BASEURL``.
            filter_urls: container of href values to ignore; defaults to the
                module-level ``FILTERURLS``.
        """
        HTMLParser.HTMLParser.__init__(self)
        self.recording = 0
        self.url = None
        self.ArtistUrls = []
        # The module constants are only looked up when no override is given.
        self.base_url = BASEURL if base_url is None else base_url
        self.filter_urls = FILTERURLS if filter_urls is None else filter_urls
        # Text pieces of the link that is currently being recorded.
        self._text_chunks = []

    def handle_starttag(self, tag, attrs):
        """Start recording when an ``<a>`` with an accepted href is opened."""
        if tag != 'a':
            return
        if self.recording:
            # An <a> inside a link that is already being recorded: only track
            # the depth so the matching </a> does not end the recording early.
            self.recording += 1
            return
        for name, value in attrs:
            if name == 'href' and value not in self.filter_urls:
                # Progress output; the parenthesised form prints the same
                # text whether print is a statement or a function.
                print(value)
                self.url = urljoin(self.base_url, value)
                break
        else:
            # No acceptable href on this tag: ignore the link entirely.
            return
        self.recording = 1
        self._text_chunks = []

    def handle_endtag(self, tag):
        """Stop recording (and store the entry) when the outermost ``</a>`` closes."""
        if tag == 'a' and self.recording:
            self.recording -= 1
            if not self.recording:
                self._flush_link()

    def handle_data(self, data):
        """Buffer text that appears inside a recorded link.

        The parser may deliver the text of a single link in several calls, so
        the pieces are joined later instead of being stored one by one.
        """
        if self.recording:
            self._text_chunks.append(data)

    def close(self):
        """Finish parsing and keep the text of a link that was never closed."""
        HTMLParser.HTMLParser.close(self)
        if self.recording:
            self._flush_link()

    def _flush_link(self):
        """Turn the buffered link text into one ``[artist, url]`` entry."""
        if not self._text_chunks:
            # A link without any text produces no entry.
            return
        artist = "".join(self._text_chunks).strip("/[]")
        self._text_chunks = []
        self.ArtistUrls.append([artist, self.url])

    def write2file(self, filename):
        """Write the collected entries to *filename*, one ``artist , url`` line each.

        Args:
            filename: path of the file to create or overwrite.
        """
        with open(filename, "w") as f:
            for artist, url in self.ArtistUrls:
                f.write(artist + " , " + url + "\n")
if __name__ == "__main__":
    # Download the index page; release the connection even if reading fails.
    response = urllib2.urlopen(BASEURL)
    try:
        html = response.read()
    finally:
        response.close()

    p = ArtistLinksParser()
    p.feed(html)
    # close() makes the parser process any input it is still buffering, so it
    # has to run before the results are written out.
    p.close()
    p.write2file("aristurls.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment