Skip to content

Instantly share code, notes, and snippets.

@kamikat
Created February 16, 2015 14:50
Show Gist options
  • Select an option

  • Save kamikat/3437bdfc187ef3a05f21 to your computer and use it in GitHub Desktop.

Select an option

Save kamikat/3437bdfc187ef3a05f21 to your computer and use it in GitHub Desktop.
Crawler for lightnovel.cn, a Discuz!-based forum
#!/usr/bin/env python
#-*- coding: utf-8 -*-
import argparse
import os
import sys
from threading import Thread
from urllib import urlretrieve
from xml.sax.saxutils import escape

from pyquery import PyQuery as pq
# Command-line interface of the script: one positional thread id plus
# optional switches. Parsing itself happens right after this definition.
parser = argparse.ArgumentParser(
    description="Convert thread from lightnovel.cn to reading form")
parser.add_argument('tid', help='ID to the thread in lightnovel.cn.')
# Text length a post has to exceed in order to be written to the output.
parser.add_argument('-f', '--filter-smaller',
                    default=140, dest='filter_lower', type=int,
                    help="Filter the text less than the FILTER_LOWER(140).")
# Images are fetched by default; passing -i stores False in `img` and turns
# fetching off, so the help text must describe the switch as a negation.
parser.add_argument('-i', '--image-fetch',
                    default=True, dest='img', action='store_false',
                    help="Do not fetch images to local "
                         "(images are fetched by default).")
# Selects the cnc.lightnovel.cn host instead of www.lightnovel.cn.
parser.add_argument('-c', '--use-cnc',
                    default=False, dest='cnc', action='store_true',
                    help="Use the CNC server of lightnovel.cn")
# Keeps the thread as first fetched instead of following the author-only link.
parser.add_argument('-r', '--no-redirect',
                    default=False, dest='noredir', action='store_true',
                    help="Do not use the feature view author only.")
# In update mode `tid` is opened as a file and the thread id is read from it.
parser.add_argument('-u', '--update',
                    default=False, dest='update', action='store_true',
                    help="Update the html file specified by tid.")
# Pixel size substituted into the `font-size` rule of the generated page.
parser.add_argument('-s', '--font-size',
                    default=24, dest='fontsize', type=int,
                    help="Define font size in pixel(default 24).")
# Parse the command line once; the rest of the script reads its settings
# from `args`.
args = parser.parse_args()

# Update mode: `tid` is the path of an existing html file, and the real
# thread id is recovered from that file's <meta name="thread"> tag.
if args.update:
    # `with` releases the handle even when reading the file fails.
    with open(args.tid, "r") as saved_page:
        d = pq(saved_page.read())
    tid_from_file = d("""meta[name="thread"]""").attr.content
    # Covers None, an empty list and an empty string alike.
    if not tid_from_file:
        print("Error: tid not found in file: %s" % args.tid)
        # sys.exit is the supported way to stop a script; the bare `exit`
        # helper is only injected by the `site` module.
        sys.exit(-1)
    args.tid = tid_from_file
# Pick the mirror: the CNC host when --use-cnc was given, the www host
# otherwise.
domain = ("http://cnc.lightnovel.cn/" if args.cnc
          else "http://www.lightnovel.cn/")

# Site-relative address of the first page of the requested thread.
url = "forum.php?mod=viewthread&tid=%s&page=1" % args.tid

first_page_address = domain + url
print("Fetching %s..." % first_page_address)
d = pq(url=first_page_address)
# Get the title. Slashes and backslashes are stripped because the title is
# also used as the output file name and as the image directory name.
subject = d("#thread_subject").text()
name = subject.replace("/", "").replace("\\", "")
print("Processing %s..." % name)

# Filter thread to author-only: continue from the href that pyquery reports
# for the ".pti .authi a" selection (presumably the forum's "view author
# only" link -- confirm) and drop the page fetched so far, so that the page
# loop downloads the filtered view.
if not args.noredir:
    url = d(".pti .authi a").attr("href")
    d = None
    print("Redirecting...")
#Write file
# Head of the generated page. It carries three placeholders, in order: the
# thread id (stored in a <meta name="thread"> tag), the page title and the
# body font size in pixels. The embedded script stores the scroll position
# in a cookie named "progress" + thread id (kept for 30 days) and restores
# it when the page loads.
header_template = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="thread" content="%s" />
<title>%s</title>
<style type="text/css">
body{
font-family: Tahoma, Helvetica, Arial, sans-serif;
font-size: %dpx;
}
</style>
<script type="text/javascript">
function createCookie(name,value,days) {
if (days) {
var date = new Date();
date.setTime(date.getTime()+(days*24*60*60*1000));
var expires = "; expires="+date.toGMTString();
}
else var expires = "";
document.cookie = name+"="+value+expires+"; path=/";
}
function readCookie(name) {
var nameEQ = name + "=";
var ca = document.cookie.split(';');
for(var i=0;i < ca.length;i++) {
var c = ca[i];
while (c.charAt(0)==' ') c = c.substring(1,c.length);
if (c.indexOf(nameEQ) == 0) return c.substring(nameEQ.length,c.length);
}
return null;
}
function getMeta(Key) {
var metas = document.getElementsByTagName('META'); //get HTML Tag
var i;
for (i = 0; i < metas.length; i++)
if (metas[i].getAttribute('NAME') == Key)
break;
var Test = metas[i].getAttribute('CONTENT');
return Test;
}
var scrollPos;
if(typeof document.compatMode != 'undefined' && document.compatMode != 'BackCompat') {
scrollPos = document.documentElement;
}else if (typeof document.body != 'undefined') {
scrollPos = document.body;
}
function _(dest){
scrollPos.scrollTop=dest;
}
function load(){
progress=readCookie("progress"+getMeta("thread"));
if(progress!=null){
_(progress);
}
};
var prev_update = Date.now();
var sTop = 0;
function scroll(){
if(Date.now() - prev_update > 1000 || Math.abs(scrollPos.scrollTop - sTop) > 30){
sTop = scrollPos.scrollTop;
createCookie("progress" + getMeta("thread"), sTop, 30);
prev_update=Date.now();
}
}
</script>
<link rel="stylesheet" type="text/css" href="style.css"/>
</head>
<body onload="load();" onscroll="scroll();">"""

# The title comes from the remote page and the thread id from the command
# line (or from a previously saved file), so both are escaped before being
# embedded: a raw "&", "<" or '"' would otherwise corrupt the markup.
safe_tid = escape(args.tid, {'"': "&quot;"})
safe_title = escape(name).encode("utf-8")

# The handle stays open on purpose: the page loop appends the posts to it
# and finish() closes it.
f = open(name + ".html", "w")
f.write(header_template % (safe_tid, safe_title, args.fontsize))
def finish(f):
    """Write the closing markup to *f*, close it and end the program.

    Args:
        f: Writable file-like object holding the page generated so far.

    Raises:
        SystemExit: Always, with exit status 0, once the file is closed.
    """
    try:
        f.write("""</body></html>""")
    finally:
        # Release the handle even when the final write fails.
        f.close()
    # sys.exit is the documented way to end a script; the bare `exit` helper
    # is installed by the `site` module and is not guaranteed to exist.
    sys.exit(0)
def _localize_images(post, folder, base_url):
    """Point every <img> inside *post* at a local copy stored in *folder*.

    Images that are not on disk yet are downloaded in background threads.

    Args:
        post: PyQuery selection wrapping a single post body.
        folder: Directory (created on demand) that receives the images.
        base_url: Site root prepended to image addresses that are not
            absolute.
    """
    for node in post("img"):
        img = pq(node)
        if not os.path.isdir(folder):
            os.mkdir(folder)
        if img.attr.src is None:
            # Fall back to the `file` attribute when `src` is missing
            # (presumably a lazy-loading placeholder -- confirm); images
            # carrying neither attribute are left untouched.
            if img.attr.file is None:
                continue
            img.attr.src = img.attr.file
        new_path = os.path.join(folder, os.path.basename(img.attr.src))
        if not os.path.exists(new_path):
            img_url = img.attr.src
            # Absolute addresses are used as they are. Testing only for
            # "http://" would prefix "https://" images with the site root
            # and produce an unusable address.
            if not img_url.startswith(("http://", "https://")):
                img_url = base_url + img_url
            print("Retrieve image %s" % img_url)
            Thread(target=urlretrieve,
                   args=(img_url, new_path)).start()
        img.attr.src = new_path


# Walk the thread page by page; `url` becomes None once the current page
# has no ".pg .nxt" link.
while url is not None:
    # `d` is None whenever the page behind `url` still has to be downloaded.
    if d is None:
        print("Fetching %s..." % (domain + url))
        d = pq(url=domain + url)
    p = d(".t_f")
    p.remove(".pstatus")
    for post in p:
        s = pq(post)
        # NOTE(review): the source lost its indentation; the `else` below is
        # assumed to pair with this length test, which means the first post
        # that is not longer than the threshold ends the whole run -- confirm.
        if len(s.text()) > args.filter_lower:
            if args.img:
                _localize_images(s, name, domain)
            f.write(s.html().encode("utf-8"))
            f.write("""<br />""")
        else:
            finish(f)
    url = d(".pg .nxt").attr("href")
    d = None
finish(f)
@kamikat
Copy link
Author

kamikat commented Feb 16, 2015

This application crawls the shared novel text from the website.

The script may also be applied to other Discuz! forums with a few changes to
the source.

You need Python 2.7 to run this script.

Install the pyquery dependency with easy_install pyquery.

Usage

See lightnovel.py --help for details.

Feature

  • Support both the CNC and ChinaNet servers of lightnovel.cn
  • Fetch images from the site
  • Remember the last reading position
  • Synchronize the crawled novel text with the online version
  • Filter text blocks by text length

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment