Created
February 16, 2015 14:50
-
-
Save kamikat/3437bdfc187ef3a05f21 to your computer and use it in GitHub Desktop.
Crawler for lightnovel.cn in Discuz!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| #-*- coding: utf-8 -*- | |
| from pyquery import PyQuery as pq | |
| from urllib import urlretrieve | |
| from threading import Thread | |
| import sys, os | |
| import argparse | |
# Command-line interface: one required positional argument (``tid``) plus
# optional switches. The parsed result is stored in the module-level ``args``.
parser = argparse.ArgumentParser(
    description="Convert thread from lightnovel.cn to reading form")
# With --update, ``tid`` is instead the path of a previously generated HTML
# file (see the --update help text).
parser.add_argument('tid', help='ID to the thread in lightnovel.cn.')
parser.add_argument('-f', '--filter-smaller',
                    default=140, dest='filter_lower', type=int,
                    help="Filter the text less than the FILTER_LOWER(140).")
# Images are fetched by default (default=True); because the action is
# ``store_false``, passing -i turns fetching OFF. The help text states that
# explicitly so it no longer contradicts the flag's actual effect.
parser.add_argument('-i', '--image-fetch',
                    default=True, dest='img', action='store_false',
                    help="Do not fetch images to local "
                         "(images are fetched by default).")
parser.add_argument('-c', '--use-cnc',
                    default=False, dest='cnc', action='store_true',
                    help="Use the CNC server of lightnovel.cn")
parser.add_argument('-r', '--no-redirect',
                    default=False, dest='noredir', action='store_true',
                    help="Do not use the feature view author only.")
parser.add_argument('-u', '--update',
                    default=False, dest='update', action='store_true',
                    help="Update the html file specified by tid.")
parser.add_argument('-s', '--font-size',
                    default=24, dest='fontsize', type=int,
                    help="Define font size in pixel(default 24).")
args = parser.parse_args()
# Update mode: ``args.tid`` holds the path of a previously generated HTML
# file. Recover the real thread id from its <meta name="thread"> tag and
# continue as if that id had been given on the command line.
if args.update:
    # ``with`` guarantees the handle is released even if reading or parsing
    # raises.
    with open(args.tid, "r") as saved_page:
        d = pq(saved_page.read())
    tid_from_file = d("""meta[name="thread"]""").attr.content
    if tid_from_file in (None, [], ""):
        print("Error: tid not found in file: %s" % args.tid)
        # sys.exit instead of the site-injected ``exit`` helper, which is
        # meant for interactive sessions and is absent under ``python -S``.
        sys.exit(-1)
    args.tid = tid_from_file
# Choose which mirror to talk to, then download page 1 of the thread.
domain = ("http://cnc.lightnovel.cn/" if args.cnc
          else "http://www.lightnovel.cn/")
# ``url`` is kept relative to ``domain``; later steps replace it with the
# relative links found in the fetched pages.
url = "forum.php?mod=viewthread&tid=%s&page=1" % args.tid
print("Fetching %s..." % (domain + url))
d = pq(url=domain + url)
# Get the title. It is reused as the output file name and as the image
# directory name, so path separators are stripped from it.
t = d("#thread_subject")
name = t.text().replace("/", "").replace("\\", "")
print("Processing %s..." % name)

# Filter thread to author-only
if not args.noredir:
    author_only_href = d(".pti .authi a").attr("href")
    if author_only_href:
        url = author_only_href
        # Drop the page fetched above so the main loop downloads ``url``.
        d = None
        print("Redirecting...")
    else:
        # Without this guard ``url`` became None, the main loop never ran and
        # an empty document was written. Fall back to the page already
        # fetched instead.
        print("Author-only link not found; using the unfiltered thread.")
# Write file
# Local import: only this step needs it.
from xml.sax.saxutils import escape

# Document prologue: doctype, <head> (thread id meta tag, title, font size,
# the inline script that stores the scroll position in a cookie keyed by the
# thread id) and the opening <body> tag. The three placeholders are, in
# order: thread id, title, font size in pixels.
page_header = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="thread" content="%s" />
<title>%s</title>
<style type="text/css">
body{
font-family: Tahoma, Helvetica, Arial, sans-serif;
font-size: %dpx;
}
</style>
<script type="text/javascript">
function createCookie(name,value,days) {
if (days) {
var date = new Date();
date.setTime(date.getTime()+(days*24*60*60*1000));
var expires = "; expires="+date.toGMTString();
}
else var expires = "";
document.cookie = name+"="+value+expires+"; path=/";
}
function readCookie(name) {
var nameEQ = name + "=";
var ca = document.cookie.split(';');
for(var i=0;i < ca.length;i++) {
var c = ca[i];
while (c.charAt(0)==' ') c = c.substring(1,c.length);
if (c.indexOf(nameEQ) == 0) return c.substring(nameEQ.length,c.length);
}
return null;
}
function getMeta(Key) {
var metas = document.getElementsByTagName('META'); //get HTML Tag
var i;
for (i = 0; i < metas.length; i++)
if (metas[i].getAttribute('NAME') == Key)
break;
var Test = metas[i].getAttribute('CONTENT');
return Test;
}
var scrollPos;
if(typeof document.compatMode != 'undefined' && document.compatMode != 'BackCompat') {
scrollPos = document.documentElement;
}else if (typeof document.body != 'undefined') {
scrollPos = document.body;
}
function _(dest){
scrollPos.scrollTop=dest;
}
function load(){
progress=readCookie("progress"+getMeta("thread"));
if(progress!=null){
_(progress);
}
};
var prev_update = Date.now();
var sTop = 0;
function scroll(){
if(Date.now() - prev_update > 1000 || Math.abs(scrollPos.scrollTop - sTop) > 30){
sTop = scrollPos.scrollTop;
createCookie("progress" + getMeta("thread"), sTop, 30);
prev_update=Date.now();
}
}
</script>
<link rel="stylesheet" type="text/css" href="style.css"/>
</head>
<body onload="load();" onscroll="scroll();">"""

# The title is text taken from the remote page and the thread id comes from
# the command line or a saved file; escape both so that characters such as
# '&', '<' or '"' cannot break the generated markup.
escaped_tid = escape(args.tid, {'"': "&quot;"})
escaped_title = escape(name).encode("utf-8")

# The handle stays open on purpose: the posts are appended to it later and
# finish() is responsible for closing it.
f = open(name + ".html", "w")
f.write(page_header % (escaped_tid, escaped_title, args.fontsize))
def finish(f):
    """Terminate the HTML document written to *f* and end the script.

    Writes the closing ``</body></html>`` tags, closes *f*, then exits the
    process with status 0. This function never returns.
    """
    try:
        f.write("""</body></html>""")
    finally:
        # Release the handle even if the final write fails.
        f.close()
    # sys.exit rather than the ``exit`` helper: that name is injected by the
    # ``site`` module for interactive use and is missing under ``python -S``.
    sys.exit(0)
# Main loop: walk the thread page by page, append every sufficiently long
# post to the output file, and follow the "next page" link until none is left.
while url is not None:
    if d is None:
        print("Fetching %s..." % (domain + url))
        d = pq(url=domain + url)
    p = d(".t_f")
    p.remove(".pstatus")
    for post in p:
        s = pq(post)
        if len(s.text()) <= args.filter_lower:
            # NOTE(review): indentation was lost in this copy of the source.
            # The original ``else: finish(f)`` is read as belonging to the
            # length check, so the first post at or below the threshold ends
            # the whole run - confirm against the upstream file.
            finish(f)  # exits the process, nothing below runs
        if args.img:
            for i in s("img"):
                img = pq(i)
                # Downloaded images go into a directory named after the title.
                if not os.path.isdir(name):
                    os.mkdir(name)
                if img.attr.src is None:
                    # Fall back to the ``file`` attribute (presumably used by
                    # the forum for lazily loaded images - verify).
                    if img.attr.file is None:
                        continue
                    img.attr.src = img.attr.file
                new_path = os.path.join(name, os.path.basename(img.attr.src))
                if not os.path.exists(new_path):
                    img_url = img.attr.src
                    # Only prefix the forum domain onto relative links. The
                    # original test recognised just "http://", so "https://"
                    # URLs were prefixed too and became invalid.
                    if not img_url.startswith(("http://", "https://")):
                        img_url = domain + img_url
                    print("Retrieve image %s" % img_url)
                    # Download in a background thread so page processing is
                    # not blocked by image transfers.
                    Thread(target=urlretrieve,
                           args=(img_url, new_path)).start()
                # Point the saved markup at the local copy.
                img.attr.src = new_path
        f.write(s.html().encode("utf-8"))
        f.write("""<br />""")
    # None when there is no "next page" link, which ends the loop.
    url = d(".pg .nxt").attr("href")
    d = None
finish(f)
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This application crawls the shared novel text from the website.
The script may also be applied to other Discuz! forums with a few changes to
the source.
You need Python 2.7 to make this script work.
Use
`easy_install pyquery` to install the pyquery dependency.

Usage
See
`lightnovel.py --help` for details.

Feature