Last active
August 31, 2015 05:31
-
-
Save shuxiang/637748ea392143f00fb9 to your computer and use it in GitHub Desktop.
get words from shanbay.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf8
"""pip install gevent pyquery requests weasyprint"""
import re
import sys

import gevent
from gevent import monkey
# Patch the standard library before `requests` is imported, as the original
# import order did.
monkey.patch_all()

from pyquery import PyQuery as pq
import requests
from weasyprint import HTML
# Site root; prepended to the relative links found on the wordbook page.
domain = "http://shanbay.com"
# Wordbook index page whose word lists are scraped.
word_book = "http://shanbay.com/wordbook/97900/"
# Stylesheet handed to WeasyPrint when rendering the PDF.
css_file = "typography.css"
# Base name (without extension) of the generated .html and .pdf files.
output_name = "1368"

# Known output_name / word_book pairs:
# 1156 http://shanbay.com/wordbook/14/
# 1368 http://shanbay.com/wordbook/97900/
def _to_bytes(text):
    """Return *text* as UTF-8 bytes; byte strings are passed through as-is."""
    return text if isinstance(text, bytes) else text.encode('utf8')


def _page_count(list_html, url):
    """Return the page count announced inside a word-list page.

    The page source is searched for a ``Math.ceil(<a> / <b>)`` expression and
    the ceiling of ``a / b`` is returned (presumably total words over words
    per page -- verify against the site).

    list_html -- raw bytes of the word-list page.
    url       -- address of that page; only used in the error message.

    Raises ValueError when the expression is not present in the page.
    """
    m = re.search(br'Math\.ceil\((\d+)\s{0,}\/\s{0,}(\d+)\)', list_html)
    if m is None:
        raise ValueError('no Math.ceil(a / b) page count found in %s' % url)
    numerator, denominator = int(m.group(1)), int(m.group(2))
    # Integer ceiling division; same result as the original
    # "quotient + 1 if there is a remainder" computation.
    return -(-numerator // denominator)


def _write_word_list(word_file, url):
    """Append every page of the word list at *url* to *word_file* as tables."""
    pages = _page_count(requests.get(url).content, url)
    print('%s %s ...............' % (url, pages))
    # NOTE(review): pages are requested as ?page=0 .. ?page=pages-1, exactly
    # as the original did; confirm whether the site numbers pages from 1.
    for page in range(pages):
        content = requests.get(url + "?page=%s" % page).content
        table_html = pq(content)('.span8 table').html()
        if table_html is None:
            # No '.span8 table' on this page: skip it instead of crashing
            # on None.encode().
            continue
        word_file.write(b'<table class="table table-bordered table-striped">')
        word_file.write(_to_bytes(table_html))
        word_file.write(b'</table>')


def get_html():
    """Scrape every word list of ``word_book`` into ``./<output_name>.html``.

    Each link matching '.wordbook-wordlist-name a' on the wordbook page is
    followed, and the '.span8 table' content of each of its pages is written
    into one HTML document wrapped by a fixed header and footer.

    Returns None.  Raises ValueError when a word-list page carries no
    page-count expression; request errors propagate unchanged.
    """
    # get word =====================================================
    word_lists = pq(requests.get(word_book).content)('.wordbook-wordlist-name a')
    # NOTE(review): "{{ title }}" is written out literally; nothing in this
    # function substitutes it.
    head = """<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title>{{ title }} </title>
<!--<link rel="stylesheet" href="./%s" />-->
</head>
<body>
<div class="container">
"""%css_file
    tail = """</div></body></html>"""

    # Binary mode: everything is written as UTF-8 bytes, which behaves the
    # same on Python 2 and 3.  The with-block closes the file even when a
    # request or parse step raises.
    with open('./%s.html' % output_name, 'wb') as word_file:
        word_file.write(_to_bytes(head))
        for link in word_lists:
            _write_word_list(word_file, domain + pq(link).attr('href'))
        word_file.write(_to_bytes(tail))
def render_pdf():
    """Render ``./<output_name>.html`` to ``./<output_name>.pdf``.

    The HTML file is read in full and handed to WeasyPrint together with
    ``css_file`` as the only stylesheet.  Returns None; I/O and rendering
    errors propagate unchanged.
    """
    # to pdf ===================================================
    # Read raw bytes so no locale-dependent decoding happens here (the
    # original passed the undecoded Python 2 str in the same way).  The file
    # is closed before rendering starts rather than held open throughout.
    with open('./%s.html' % output_name, 'rb') as word_file:
        print('rendering pdf ...............')
        html_out = word_file.read()
    HTML(string=html_out).write_pdf('./%s.pdf' % output_name, stylesheets=[css_file])
if __name__ == '__main__':
    # The first command-line argument picks the stage to run: 'html' only
    # scrapes, 'pdf' only renders.  Any other value, or no argument at all,
    # runs both stages in order.
    requested = sys.argv[1:2]
    mode = requested[0] if requested else ''
    stages = {
        'html': (get_html,),
        'pdf': (render_pdf,),
    }.get(mode, (get_html, render_pdf))
    for stage in stages:
        stage()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* --------------------------------------------------------------

   typography.css
   * Sets up some sensible default typography.

-------------------------------------------------------------- */

/* Default font settings.
   The font-size percentage is of 16px. (0.75 * 16px = 12px) */
html { font-size:100.01%; }
body {
  font-size: 75%;
  color: #222;
  background: #fff;
  font-family: "Helvetica Neue", Arial, Helvetica, sans-serif;
}
/* Headings
-------------------------------------------------------------- */

h1,h2,h3,h4,h5,h6 { font-weight: normal; color: #111; }

h1 { font-size: 3em; line-height: 1; margin-bottom: 0.5em; }
h2 { font-size: 2em; margin-bottom: 0.75em; }
h3 { font-size: 1.5em; line-height: 1; margin-bottom: 1em; }
h4 { font-size: 1.2em; line-height: 1.25; margin-bottom: 1.25em; }
h5 { font-size: 1em; font-weight: bold; margin-bottom: 1.5em; }
h6 { font-size: 1em; font-weight: bold; }

/* Images placed inside a heading carry no margin of their own. */
h1 img, h2 img, h3 img,
h4 img, h5 img, h6 img {
  margin: 0;
}
/* Text elements
-------------------------------------------------------------- */

p { margin: 0 0 1.5em; }

/*
  These can be used to pull an image at the start of a paragraph, so
  that the text flows around it (usage: <p><img class="left">Text</p>)
*/
.left { float: left !important; }
p .left { margin: 1.5em 1.5em 1.5em 0; padding: 0; }
.right { float: right !important; }
p .right { margin: 1.5em 0 1.5em 1.5em; padding: 0; }

a:focus,
a:hover { color: #09f; }
a { color: #06c; text-decoration: underline; }

blockquote { margin: 1.5em; color: #666; font-style: italic; }
strong,dfn { font-weight: bold; }
em,dfn { font-style: italic; }
sup, sub { line-height: 0; }

abbr,
acronym { border-bottom: 1px dotted #666; }
address { margin: 0 0 1.5em; font-style: italic; }
del { color:#666; }

pre { margin: 1.5em 0; white-space: pre; }
pre,code,tt { font: 1em 'andale mono', 'lucida console', monospace; line-height: 1.5; }
/* Lists
-------------------------------------------------------------- */

/* Nested lists add no extra outer margin. */
li ul,
li ol { margin: 0; }
ul, ol { margin: 0 1.5em 1.5em 0; padding-left: 1.5em; }

ul { list-style-type: disc; }
ol { list-style-type: decimal; }

dl { margin: 0 0 1.5em 0; }
dl dt { font-weight: bold; }
dd { margin-left: 1.5em;}
/* Tables
-------------------------------------------------------------- */

/*
  Because of the need for padding on TH and TD, the vertical rhythm
  on table cells has to be 27px, instead of the standard 18px or 36px
  of other elements.
*/
table { margin-bottom: 1.4em; width:100%; }
th { font-weight: bold; }
thead th { background: #c3d9ff; }
th,td,caption { padding: 4px 10px 4px 5px; }

/*
  You can zebra-stripe your tables in outdated browsers by adding
  the class "even" to every other table row.
*/
tbody tr:nth-child(even) td,
tbody tr.even td {
  background: #e5ecf9;
}
tfoot { font-style: italic; }
caption { background: #eee; }
/* Misc classes
-------------------------------------------------------------- */

/* Text size helpers. */
.small { font-size: .8em; margin-bottom: 1.875em; line-height: 1.875em; }
.large { font-size: 1.2em; line-height: 2.5em; margin-bottom: 1.25em; }
.hide { display: none; }

/* Text and background colour helpers. */
.quiet { color: #666; }
.loud { color: #000; }
.highlight { background:#ff0; }
.added { background:#060; color: #fff; }
.removed { background:#900; color: #fff; }

/* Strip the margin and padding on one side of an element. */
.first { margin-left:0; padding-left:0; }
.last { margin-right:0; padding-right:0; }
.top { margin-top:0; padding-top:0; }
.bottom { margin-bottom:0; padding-bottom:0; }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment