Skip to content

Instantly share code, notes, and snippets.

@t2psyto
Created April 3, 2013 14:53
Show Gist options
  • Select an option

  • Save t2psyto/5301908 to your computer and use it in GitHub Desktop.

Select an option

Save t2psyto/5301908 to your computer and use it in GitHub Desktop.
#-*- coding: utf-8 -*-
# Output path of the generated HTML index; main() writes the finished page here.
filename = "c:/TEMP/gihyosd_2001-2012_index.html"
from StringIO import StringIO
import gzip
import BeautifulSoup as BS
import urllib2
# Workaround for a BeautifulSoup bug: make UTF-8 the interpreter-wide default
# string encoding.
# info:
# http://sipbo.info/?p=123
import sys
if not hasattr(sys, "setdefaultencoding"):
    # sys.setdefaultencoding is not available here, so reload(sys) is used to
    # get it back. The current stdout/stderr objects are saved first and put
    # back afterwards so the reload cannot replace them.
    # info:
    # http://masaland.cocolog-nifty.com/blog/2010/12/python-s60-20-1.html
    wk = sys.stdout
    wk2 = sys.stderr
    reload(sys)
    sys.stdout = wk
    sys.stderr = wk2
# Runs whether or not the reload above was needed.
sys.setdefaultencoding("utf-8")
#soup = None
#toc = None
# Stand-in markup for an issue page that answered with an HTTP error.
_NOT_FOUND_HTML = '<div id="toc">-- 404 NOT FOUND --</div>'

# Stand-in markup for a page that was fetched but has no <div id="toc">.
_MISSING_TOC_HTML = u'<div id="toc">-- TOC NOT FOUND --</div>'


def _fetch_page(url):
    """Return the raw response body of *url*, or None on an HTTP error."""
    try:
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()
    except urllib2.HTTPError:
        return None


def _gunzip_if_needed(data):
    """Return *data* gunzipped; if gzip rejects it, return it unchanged."""
    try:
        return gzip.GzipFile(fileobj=StringIO(data)).read()
    except IOError:
        # GzipFile raised IOError, so treat the payload as plain data.
        return data


def _extract_toc(html):
    """Return the first <div id="toc"> of *html* as unicode markup.

    A placeholder div is returned when the page has no such element, so one
    unexpected page cannot abort the whole run with an IndexError.
    """
    matches = BS.BeautifulSoup(html).findAll("div", {"id": "toc"})
    if not matches:
        return _MISSING_TOC_HTML
    return unicode(matches[0])


def main():
    """Collect the table of contents of every 2001-2012 issue into one page.

    For each year/month an archive URL is built and fetched; the page's
    <div id="toc"> is extracted and appended under a heading and a link to
    the source page. The combined HTML is written, UTF-8 encoded, to the
    module-level ``filename``. Progress is printed to stdout.

    Raises:
        urllib2.URLError: on network failures other than HTTP error statuses.
        IOError: if the output file cannot be written.
    """
    urlbase = "http://gihyo.jp/magazine/SD/archive/"
    toc = []
    for year in range(2001, 2013):
        for month in range(1, 13):
            url = urlbase + "%d/%d%02d" % (year, year, month)
            # No newline here: the status word below finishes the line.
            sys.stdout.write("%s -> " % url)
            data = _fetch_page(url)
            if data is None:
                data = _NOT_FOUND_HTML
                print("nothing.")
            else:
                print("ok.")
            sectiontag = "<h1>%d-%02d</h1>\n" % (year, month)
            urltag = '<a href="%s">%s</a>\n' % (url, url)
            toc.append(sectiontag + urltag + _extract_toc(_gunzip_if_needed(data)))
    print("")
    print("fileout -> %s" % filename)
    header = """<!DOCTYPE html>
<html lang="ja">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>Software Design 2001-2012 index</title>
<link href="http://twitter.github.com/bootstrap/assets/css/bootstrap.css" rel="stylesheet">
</head>
<body>
"""
    footer = """
</body>
</html>
"""
    uhtml = header + "\n".join(toc) + footer
    strhtml = uhtml.encode("utf-8")
    # The with-block closes the file even if the write fails.
    with open(filename, "wb") as fout:
        fout.write(strhtml)
    print("done.")
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not when imported.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment