Skip to content

Instantly share code, notes, and snippets.

@shuxiang
Last active August 31, 2015 05:31
Show Gist options
  • Save shuxiang/637748ea392143f00fb9 to your computer and use it in GitHub Desktop.
Save shuxiang/637748ea392143f00fb9 to your computer and use it in GitHub Desktop.
get words from shanbay.com
#coding=utf8
"""pip install gevent pyquery requests weasyprint"""
import gevent
from gevent import monkey
monkey.patch_all()
from pyquery import PyQuery as pq
import requests
import re, sys
from weasyprint import HTML
# Site root; relative links scraped from the word book are appended to it.
domain = "http://shanbay.com"
# Index page of the word book to scrape.
# Books used so far, listed as "<output_name>  <url>":
#   1156  http://shanbay.com/wordbook/14/
#   1368  http://shanbay.com/wordbook/97900/
word_book = domain + "/wordbook/97900/"
# Stylesheet applied when rendering the PDF (also named in the commented-out
# <link> tag of the generated HTML).
css_file = "typography.css"
# Base name, without extension, of the generated .html and .pdf files.
output_name = "1368"
def get_html():
    """Scrape every word list of ``word_book`` into ``./<output_name>.html``.

    The word book's index page is fetched and every link matching
    ``.wordbook-wordlist-name a`` is followed.  For each word list the page
    count is derived from the ``Math.ceil(total / per_page)`` expression found
    in the page source; the inner markup of ``.span8 table`` on each page is
    then appended to the output file, wrapped in a ``<table>`` element.

    The code targets Python 2: page content is handled as byte strings and the
    table markup is UTF-8 encoded before it is written.

    Raises:
        ValueError: if a word-list page contains no
            ``Math.ceil(total / per_page)`` expression.
    """
    # get word =====================================================
    word_lists = pq(requests.get(word_book).content)('.wordbook-wordlist-name a')
    # NOTE: "{{ title }}" is written to the file verbatim; nothing in this
    # function substitutes it.
    head = """<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title>{{ title }} </title>
<!--<link rel="stylesheet" href="./%s" />-->
</head>
<body>
<div class="container">
"""%css_file
    tail = """</div></body></html>"""
    # Compiled once here instead of on every word list.
    pages_re = re.compile(r'Math\.ceil\((\d+)\s{0,}\/\s{0,}(\d+)\)')

    # ``with`` guarantees the output file is flushed and closed even when a
    # request or a parsing step raises part-way through the scrape.
    with open('./%s.html' % output_name, 'w+') as word_file:
        word_file.write(head)

        def each_list(index, anchor):
            """Append every page of the word list linked from ``anchor``."""
            url = domain + pq(anchor).attr('href')
            word_list = requests.get(url).content
            match = pages_re.search(word_list)
            if match is None:
                # Fail with the offending URL rather than an opaque
                # AttributeError on ``None.group``.
                raise ValueError('no Math.ceil(total/per_page) found in %s' % url)
            total, per_page = int(match.group(1)), int(match.group(2))
            # Ceiling division; ``//`` keeps it an int on every Python version.
            pages = (total + per_page - 1) // per_page
            print('%s %s ...............' % (url, pages))
            # NOTE(review): pages are requested as ?page=0 .. pages-1.  If the
            # site numbers pages from 1, the last page is never fetched --
            # confirm against the site before changing.
            for page in range(pages):
                content = requests.get(url + "?page=%s" % page).content
                table_html = pq(content)('.span8 table').html()
                if table_html is None:
                    # No matching table on this page: skip it instead of
                    # crashing on ``None.encode``.
                    print('no word table found at %s?page=%s' % (url, page))
                    continue
                word_file.write('<table class="table table-bordered table-striped">')
                word_file.write(table_html.encode('utf8'))
                word_file.write('</table>')

        word_lists.each(each_list)
        word_file.write(tail)
def render_pdf():
    """Render ``./<output_name>.html`` to ``./<output_name>.pdf``.

    The HTML file is read in full and handed to WeasyPrint as a string; the
    stylesheet named by ``css_file`` is applied while rendering.
    """
    # to pdf ===================================================
    # Read inside ``with`` so the handle is closed before rendering starts and
    # is released even if reading fails.
    with open('./%s.html' % output_name, 'r') as word_file:
        print('rendering pdf ...............')
        html_out = word_file.read()
    HTML(string=html_out).write_pdf('./%s.pdf' % output_name, stylesheets=[css_file])
if __name__ == '__main__':
    # Usage: <script> [html|pdf]
    #   html -> only scrape the word book into the HTML file
    #   pdf  -> only render the existing HTML file to PDF
    #   anything else, or no argument -> scrape first, then render
    cli_args = sys.argv[1:]
    stage = cli_args[0] if cli_args else ''
    if stage != 'pdf':
        get_html()
    if stage != 'html':
        render_pdf()
/* --------------------------------------------------------------
typography.css
* Sets up some sensible default typography.
-------------------------------------------------------------- */
/* Default font settings.
The font-size percentage is of 16px. (0.75 * 16px = 12px) */
/* NOTE(review): 100.01% rather than 100% looks like a deliberate
browser-rounding workaround -- confirm before simplifying. */
html { font-size:100.01%; }
/* Base text: dark grey on white, sans-serif stack, 75% of the html size. */
body {
font-size: 75%;
color: #222;
background: #fff;
font-family: "Helvetica Neue", Arial, Helvetica, sans-serif;
}
/* Headings
-------------------------------------------------------------- */
/* Shared look for all six levels; size and spacing are set per level below. */
h1,h2,h3,h4,h5,h6 { font-weight: normal; color: #111; }
h1 { font-size: 3em; line-height: 1; margin-bottom: 0.5em; }
h2 { font-size: 2em; margin-bottom: 0.75em; }
h3 { font-size: 1.5em; line-height: 1; margin-bottom: 1em; }
h4 { font-size: 1.2em; line-height: 1.25; margin-bottom: 1.25em; }
/* h5 and h6 stay at body size (1em) and override the shared normal weight
with bold. */
h5 { font-size: 1em; font-weight: bold; margin-bottom: 1.5em; }
h6 { font-size: 1em; font-weight: bold; }
/* Images placed inside a heading get no margin of their own. */
h1 img, h2 img, h3 img,
h4 img, h5 img, h6 img {
margin: 0;
}
/* Text elements
-------------------------------------------------------------- */
p { margin: 0 0 1.5em; }
/*
These can be used to pull an image at the start of a paragraph, so
that the text flows around it (usage: <p><img class="left">Text</p>)
*/
/* The float applies to any element with the class; the margins below only
apply when that element sits inside a paragraph. */
.left { float: left !important; }
p .left { margin: 1.5em 1.5em 1.5em 0; padding: 0; }
.right { float: right !important; }
p .right { margin: 1.5em 0 1.5em 1.5em; padding: 0; }
/* The :focus/:hover selectors are more specific than the plain "a" rule that
follows, so their colour wins even though they come first. */
a:focus,
a:hover { color: #09f; }
a { color: #06c; text-decoration: underline; }
blockquote { margin: 1.5em; color: #666; font-style: italic; }
/* dfn appears in both of the next two rules, so it is bold and italic. */
strong,dfn { font-weight: bold; }
em,dfn { font-style: italic; }
/* NOTE(review): presumably keeps super/subscripts from enlarging the line
they sit on -- confirm. */
sup, sub { line-height: 0; }
abbr,
acronym { border-bottom: 1px dotted #666; }
address { margin: 0 0 1.5em; font-style: italic; }
del { color:#666; }
pre { margin: 1.5em 0; white-space: pre; }
/* line-height is declared after the font shorthand because the shorthand
resets line-height. */
pre,code,tt { font: 1em 'andale mono', 'lucida console', monospace; line-height: 1.5; }
/* Lists
-------------------------------------------------------------- */
/* Lists nested inside a list item carry no margin of their own. */
li ul,
li ol { margin: 0; }
/* Both list types share the same outer spacing and left indent. */
ul, ol { margin: 0 1.5em 1.5em 0; padding-left: 1.5em; }
ul { list-style-type: disc; }
ol { list-style-type: decimal; }
/* Definition lists: bold terms, descriptions indented by 1.5em. */
dl { margin: 0 0 1.5em 0; }
dl dt { font-weight: bold; }
dd { margin-left: 1.5em;}
/* Tables
-------------------------------------------------------------- */
/*
Because of the need for padding on TH and TD, the vertical rhythm
on table cells has to be 27px, instead of the standard 18px or 36px
of other elements.
*/
/* Tables span the full width of their container. */
table { margin-bottom: 1.4em; width:100%; }
th { font-weight: bold; }
/* Only header cells inside <thead> get the blue background. */
thead th { background: #c3d9ff; }
/* Asymmetric horizontal padding: 10px on the right, 5px on the left. */
th,td,caption { padding: 4px 10px 4px 5px; }
/*
Even body rows are striped automatically through :nth-child(even).
You can zebra-stripe your tables in outdated browsers by adding
the class "even" to every other table row.
*/
tbody tr:nth-child(even) td,
tbody tr.even td {
background: #e5ecf9;
}
tfoot { font-style: italic; }
caption { background: #eee; }
/* Misc classes
-------------------------------------------------------------- */
/* Relative text sizes, each with its own line-height and bottom margin. */
.small { font-size: .8em; margin-bottom: 1.875em; line-height: 1.875em; }
.large { font-size: 1.2em; line-height: 2.5em; margin-bottom: 1.25em; }
/* Removes the element from the layout entirely. */
.hide { display: none; }
/* Text emphasis: grey for de-emphasised text, black for emphasised text. */
.quiet { color: #666; }
.loud { color: #000; }
/* Background markers: yellow highlight, green "added", red "removed";
the latter two switch the text to white. */
.highlight { background:#ff0; }
.added { background:#060; color: #fff; }
.removed { background:#900; color: #fff; }
/* Edge helpers: each one zeroes both margin and padding on a single side. */
.first { margin-left:0; padding-left:0; }
.last { margin-right:0; padding-right:0; }
.top { margin-top:0; padding-top:0; }
.bottom { margin-bottom:0; padding-bottom:0; }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment