Created
July 1, 2013 00:09
-
-
Save whiteclover/5897586 to your computer and use it in GitHub Desktop.
TouHou Project Artist urls Fetch xml parser demo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
# This script fetches the TouHou Project artists' URLs, in preparation for fetching albums and songs.
import urllib2 | |
import HTMLParser | |
from urlparse import urljoin | |
# href values that must never be recorded as artist links. The parser compares
# each anchor's raw href against this list (exact match) before resolving it.
# NOTE(review): the "?C=...;O=..." entries look like a directory listing's
# column-sort links and "/" like its parent-directory link -- confirm against
# the live page.
FILTERURLS = [
    "/cgi-bin/feedback",
    '?C=N;O=D',
    "?C=M;O=A",
    "?C=S;O=A",
    "?C=D;O=A",
    "/",
    "cp_images.sh",
    "fix_unicode.sh",
    "mv_images.sh",
    "ren13.py",
]

# Page that is downloaded and parsed; relative hrefs are resolved against it.
BASEURL = "http://kuukunen.net:8080/toh/"
class ArtistLinksParser(HTMLParser.HTMLParser):
    """Collect ``[artist, url]`` pairs from the ``<a>`` tags of an HTML page.

    An anchor is recorded when it carries an ``href`` that is not listed in
    the filter list.  The href is resolved against the base URL and paired
    with the anchor's text, from which leading/trailing ``/``, ``[`` and ``]``
    characters are stripped.  Results accumulate in ``self.ArtistUrls``.
    """

    def __init__(self, base_url=None, filter_urls=None):
        """Set up an empty parser.

        base_url    -- URL that relative hrefs are resolved against;
                       defaults to the module-level BASEURL.
        filter_urls -- collection of href values to ignore (exact match);
                       defaults to the module-level FILTERURLS.
        """
        # Explicit base-class call rather than super(): this also works when
        # the base parser is an old-style class.
        HTMLParser.HTMLParser.__init__(self)
        self.base_url = BASEURL if base_url is None else base_url
        self.filter_urls = FILTERURLS if filter_urls is None else filter_urls
        self.recording = 0    # nesting depth of <a> tags while inside a kept anchor
        self.url = None       # absolute URL of the anchor currently being recorded
        self.ArtistUrls = []  # collected [artist, url] rows, in document order
        self._row = None      # row of the current anchor, created on its first text
        self._raw = ""        # unstripped text gathered so far for the current anchor

    def handle_starttag(self, tag, attrs):
        """Start recording when an ``<a>`` with an acceptable href opens."""
        if tag != 'a':
            return
        if self.recording:
            # Anchor nested inside a recorded one: only track the depth so the
            # matching end tag does not stop the recording too early.
            self.recording += 1
            return
        for name, value in attrs:
            if name == 'href' and value not in self.filter_urls:
                print(value)
                self.url = urljoin(self.base_url, value)
                break
        else:
            # No usable href on this anchor: ignore it and its text.
            return
        self.recording = 1
        self._row = None
        self._raw = ""

    def handle_endtag(self, tag):
        """Leave one level of anchor nesting when an ``</a>`` closes."""
        if tag == 'a' and self.recording:
            self.recording -= 1

    def handle_data(self, data):
        """Add text found inside a recorded anchor to that anchor's row."""
        if not self.recording:
            return
        if self._row is None:
            # First text of this anchor: create its row right away.
            self._row = ["", self.url]
            self.ArtistUrls.append(self._row)
        # Text interrupted by nested tags arrives in several calls; merge the
        # pieces so each link yields exactly one row, and strip the joined
        # text rather than every fragment.
        self._raw += data
        self._row[0] = self._raw.strip("/[]")

    def write2file(self, filename):
        """Write the collected rows to *filename*, one ``artist , url`` per line."""
        with open(filename, "w") as f:
            for artist, url in self.ArtistUrls:
                # Separate writes: name and URL are never combined into a
                # single string before being written.
                f.write(artist)
                f.write(" , ")
                f.write(url)
                f.write("\n")
if __name__ == "__main__":
    # Download the page at BASEURL, extract the artist links and save them.
    p = ArtistLinksParser()
    f = urllib2.urlopen(BASEURL)
    try:
        html = f.read()
    finally:
        # Release the HTTP connection even if the read fails.
        f.close()
    p.feed(html)
    # close() makes the parser process any input it is still buffering, so it
    # must run before the results are written out, not after.
    p.close()
    p.write2file("aristurls.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment