#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Source: GitHub Gist uranusjr/5805368 by @uranusjr, created June 18, 2013.
import urllib

try:
    import HTMLParser
except ImportError:
    # Interpreters without the Python 2 module name ship it as html.parser.
    import html.parser as HTMLParser
# Fallback values; they stay in effect only if the assignments below are
# removed or commented out.
URL = OUTPUT_PATH = None

# Page to download and parse.
URL = 'http://m-p.sakura.ne.jp/Html/anime.html#anime'

# File that receives the extracted text. main() prints the text instead when
# this is empty or None.
OUTPUT_PATH = '/Users/uranusjr/Desktop/o.txt'
class MoonPhaseParser(HTMLParser.HTMLParser, object):
    """Extract text and links from the rows of an HTML table.

    Feed markup with ``feed()`` and read the collected text from ``output``.

    * The first ``<tr>`` encountered is skipped entirely (presumably the
      header row).
    * In every later row, text found in the first ``<td>`` is appended to
      ``output`` followed by a single space.
    * Text found in the second ``<td>`` is whitespace-normalised and appended
      followed by a newline, the most recent ``href`` seen in that row (empty
      if the row has none), and a blank line.
    * Text in any other position is ignored.

    ``object`` is listed as an extra base so the class is new-style and
    ``super()`` can be used even where the stdlib parser is an old-style
    class (Python 2).
    """

    def __init__(self):
        super(MoonPhaseParser, self).__init__()
        self.table_row = 0      # number of <tr> start tags seen so far
        self.table_column = 0   # number of <td> start tags seen in this row
        self.current_href = ''  # last href seen in the current row
        self.output = ''        # accumulated result text

    def handle_starttag(self, tag, attrs):
        """Track row/column position and remember the latest link target."""
        if tag == 'tr':
            self.handle_tr_start(attrs)
        elif tag == 'td':
            self.handle_td_start(attrs)
        elif tag == 'a':
            # Only the first href attribute of the tag is considered.
            for name, value in attrs:
                if name == 'href':
                    # A valueless attribute (<a href>) is reported as None;
                    # store '' so the later string concatenation cannot fail.
                    self.current_href = value or ''
                    break

    def handle_data(self, data):
        """Append text from the first two cells of every non-first row."""
        if self.table_row == 1:
            return

        if self.table_column == 1:
            text = data.strip()
            if text:
                self.output += text + ' '
        elif self.table_column == 2:
            # split() with no arguments already drops leading/trailing
            # whitespace and collapses internal runs.
            text = ' '.join(data.split())
            if text:
                self.output += text + '\n' + self.current_href + '\n\n'

    def handle_tr_start(self, attrs):
        """Begin a new table row."""
        self.table_row += 1
        self.table_column = 0  # reset column count
        # Forget the previous row's link so a row without an <a href> does
        # not get paired with a stale URL.
        self.current_href = ''

    def handle_td_start(self, attrs):
        """Advance to the next cell of the current row."""
        self.table_column += 1
def main():
    """Download URL, extract the table text, and print or save it.

    The page bytes are decoded as Shift-JIS (undecodable bytes are dropped)
    and fed through MoonPhaseParser. The collected text is written UTF-8
    encoded to OUTPUT_PATH, or printed when OUTPUT_PATH is empty or None.
    """
    from contextlib import closing
    try:
        from urllib import urlopen  # Python 2 location
    except ImportError:
        from urllib.request import urlopen  # Python 3 location

    # closing() guarantees the connection is released even if read() raises;
    # the Python 2 response object is not itself a context manager.
    with closing(urlopen(URL)) as response:
        raw = response.read()
    content = raw.decode('shift-jis', 'ignore')

    parser = MoonPhaseParser()
    parser.feed(content)

    if not OUTPUT_PATH:
        # Parenthesised single argument: valid as both the Python 2 print
        # statement and the Python 3 print function.
        print(parser.output)
    else:
        # Binary mode so the explicitly encoded UTF-8 bytes are written
        # unchanged on either Python version.
        with open(OUTPUT_PATH, 'wb') as f:
            f.write(parser.output.encode('utf-8'))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# [GitHub page footer, not part of the script] Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment