Created
April 3, 2013 14:53
-
-
Save t2psyto/5301908 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #-*- coding: utf-8 -*- | |
| filename = "c:/TEMP/gihyosd_2001-2012_index.html" | |
| from StringIO import StringIO | |
| import gzip | |
| import BeautifulSoup as BS | |
| import urllib2 | |
| # workaround for beautifulsoup bug. | |
| # info: | |
| # http://sipbo.info/?p=123 | |
| import sys | |
if not hasattr(sys, "setdefaultencoding"):
    # sys.setdefaultencoding is not available here, so reload(sys) is used to
    # get it back. The stdout/stderr objects in use are remembered first and
    # put back afterwards (workaround for reload(sys)).
    # info:
    #   http://masaland.cocolog-nifty.com/blog/2010/12/python-s60-20-1.html
    wk, wk2 = sys.stdout, sys.stderr
    reload(sys)
    sys.stdout, sys.stderr = wk, wk2
# NOTE(review): the indentation of this call was lost in the source listing;
# it is assumed to run unconditionally -- confirm against the original file.
sys.setdefaultencoding("utf-8")
| #soup = None | |
| #toc = None | |
def _gunzip_if_needed(data):
    """Return *data* gunzipped when it is a gzip stream, otherwise unchanged."""
    try:
        return gzip.GzipFile(fileobj=StringIO(data)).read()
    except IOError:
        # GzipFile could not read the payload as gzip; use it as-is.
        return data


def _extract_toc(html):
    """Return the first ``<div id="toc">`` element of *html* as unicode markup.

    When the page has no such element a placeholder div is returned instead,
    so a single unexpected page does not abort the whole run with an
    IndexError after earlier pages were already downloaded.
    """
    matches = BS.BeautifulSoup(html).findAll("div", {"id": "toc"})
    if not matches:
        return u'<div id="toc">-- TOC NOT FOUND --</div>'
    return unicode(matches[0])


def main():
    """Build one HTML index from the gihyo.jp Software Design archive pages.

    For every month from 2001-01 through 2012-12 the archive page is fetched,
    its ``<div id="toc">`` element is extracted and prefixed with a heading
    and a link to the page. All fragments are joined into one HTML document
    which is written, UTF-8 encoded, to the module-level ``filename``.
    Progress is reported on stdout.

    Any ``urllib2.HTTPError`` for a month is treated as "no issue available"
    and a placeholder entry is recorded; other errors propagate.
    """
    urlbase = "http://gihyo.jp/magazine/SD/archive/"
    toc = []
    for year in range(2001, 2013):
        for month in range(1, 13):
            url = urlbase + "%d/%d%02d" % (year, year, month)
            # No newline here: the fetch result is printed on the same line.
            sys.stdout.write("%s -> " % url)
            sys.stdout.flush()
            try:
                data = urllib2.urlopen(url).read()
                print("ok.")
            except urllib2.HTTPError:
                # Properly closed placeholder so the generated page stays
                # well-formed (the div used to be left open).
                data = '<div id="toc">-- 404 NOT FOUND --</div>'
                print("nothing.")

            sectiontag = "<h1>%d-%02d</h1>\n" % (year, month)
            urltag = '<a href="%s">%s</a>\n' % (url, url)
            toc.append(sectiontag + urltag + _extract_toc(_gunzip_if_needed(data)))

    print("")
    print("fileout -> %s" % filename)

    header = """<!DOCTYPE html>
<html lang="ja">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>Software Design 2001-2012 index</title>
<link href="http://twitter.github.com/bootstrap/assets/css/bootstrap.css" rel="stylesheet">
</head>
<body>
"""
    footer = """
</body>
</html>
"""
    uhtml = header + "\n".join(toc) + footer
    strhtml = uhtml.encode("utf-8")
    # The with-block closes the file even if the write fails.
    with open(filename, "wb") as fout:
        fout.write(strhtml)
    print("done.")


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment