Last active
March 31, 2018 02:23
-
-
Save wareya/0d2ab07b6c6306967c54f309c2a24cf4 to your computer and use it in GitHub Desktop.
narou scraper with persistent knowledge of updates (not fully tested) - use also: https://gist.github.com/wareya/4305e2f971a78c960402ac69f308128c
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!python | |
| from bs4 import BeautifulSoup | |
| import urllib | |
| from urllib.parse import urljoin | |
| import sys | |
| import aiohttp | |
| import asyncio | |
| import os.path | |
| import re | |
| import sqlite3 | |
| database = sqlite3.connect("naroudb.db") | |
| c = database.cursor() | |
| c.execute("create table if not exists narou (ncode text, title text, chapcode text, chapter int, datetime text, content text)") | |
| c.execute("create unique index if not exists idx_chapcode on narou (chapcode)") | |
| arguments = [] | |
| if len(sys.argv) < 2: | |
| import yomou | |
| arguments = yomou.get_top_300("http://yomou.syosetu.com/rank/list/type/total_total/") | |
| elif sys.argv[1] == "--titles": | |
| titles = c.execute("select ncode, title from narou").fetchall() | |
| if titles != None: | |
| for title in titles: | |
| print(f"{title[0]}; {title[1]}") | |
| else: | |
| arguments = sys.argv[1:] | |
| print("note: chapter downloads aren't persistent until all updates are downloaded") | |
| for argument in arguments: | |
| print(f"ripping {argument}") | |
| mainurl = argument | |
| if "https://" not in mainurl and "http://" not in mainurl: | |
| mainurl = "https://ncode.syosetu.com/" + mainurl | |
| ncode = mainurl.rstrip("/").rsplit('/', 1)[-1] | |
| r = urllib.request.urlopen(mainurl) | |
| data = r.read() | |
| r.close() | |
| soup = BeautifulSoup(data, "html.parser") | |
| title = soup.select(".novel_title")[0].get_text().strip() | |
| chapterurls = [] | |
| chaptertimes = [] | |
| for li in soup.select(".index_box .novel_sublist2 .subtitle a"): | |
| chapterurls += [urljoin(mainurl, li.get("href"))] | |
| for dt in soup.select(".index_box .novel_sublist2 .long_update"): | |
| chaptertimes += [re.search("([0-9]{4}/[0-1][0-9]/[0-9]{2} [0-2][0-9]:[0-5][0-9])", dt.get_text())[1]] | |
| if len(chaptertimes) != len(chapterurls): | |
| print("Assert: len(chaptertimes) != len(chapterurls)") | |
| exit() | |
| nofetch = [] | |
| count = len(chapterurls) | |
| for i in range(len(chapterurls)): | |
| url = chapterurls[i] | |
| chapter = url.rstrip("/").rsplit('/', 1)[-1] | |
| chapcode = ncode+"-"+chapter | |
| time = chaptertimes[i] | |
| knowntime = c.execute("select datetime from narou where chapcode=?", (chapcode,)).fetchone() | |
| if knowntime != None: | |
| knowntime = knowntime[0] | |
| if knowntime == time and time != None: | |
| nofetch += [url] | |
| for delete in nofetch: | |
| index = chapterurls.index(delete) | |
| del chapterurls[index] | |
| del chaptertimes[index] | |
| datas = [""] * len(chapterurls) | |
| texts = [""] * len(chapterurls) | |
| retrycount = 0 | |
| async def fetch(session, url): | |
| i = 0 | |
| while i < retrycount or retrycount <= 0: | |
| try: | |
| # depending on how far you are from japan and how bad your internet is, you might need to raise this 3 to something like a 5 or an 8 - but the higher it is, the greater the number of connections that get trapped | |
| async with session.get(url, timeout=3) as response: | |
| return await response.text() | |
| except asyncio.TimeoutError: | |
| #print("retrying a connection") | |
| continue | |
| async def load_chapter(session, url, index): | |
| data = await fetch(session, url) | |
| print(f"loaded {url}") | |
| datas[index] = data | |
| async def load_all_chapters(): | |
| connector = aiohttp.TCPConnector(ttl_dns_cache=100000000, limit=100, force_close=True, enable_cleanup_closed=True) | |
| async with aiohttp.ClientSession(connector=connector) as session: | |
| tasks = [] | |
| i = 0 | |
| for url in chapterurls: | |
| tasks.append(load_chapter(session, url, i)) | |
| i += 1 | |
| responses = asyncio.gather(*tasks) | |
| await responses | |
| loop = asyncio.get_event_loop() | |
| future = asyncio.ensure_future(load_all_chapters()) | |
| loop.run_until_complete(future) | |
| for index in range(len(datas)): | |
| soup = BeautifulSoup(datas[index], "html.parser") | |
| for entry in soup.select("#novel_honbun"): | |
| [rt.extract() for rt in entry.findAll("rt")] | |
| [rp.extract() for rp in entry.findAll("rp")] | |
| texts[index] = entry.get_text() | |
| print("writing to database...") | |
| for index in range(len(chapterurls)): | |
| url = chapterurls[index] | |
| chapter = url.rstrip("/").rsplit('/', 1)[-1] | |
| chapcode = ncode+"-"+chapter | |
| time = chaptertimes[index] | |
| text = texts[index] | |
| c.execute("insert or replace into narou values (?,?,?,?,?,?)", (ncode, title, chapcode, int(chapter), time, text)) | |
| database.commit() | |
| print("done. writing to file...") | |
| outputs = c.execute("select * from narou where ncode=? order by chapter asc", (ncode,)).fetchall() | |
| if outputs != None: | |
| realoutputs = [] | |
| for output in outputs: | |
| realoutputs += [output[-1]] | |
| f = open("scripts/"+ncode+".txt", "w", encoding="utf-8", newline="\n") | |
| f.write("\n\n\n".join(realoutputs).replace("《", "«").replace("》", "»").replace("〈", "‹").replace("〉", "›")) | |
| f.close() | |
| print("done.") | |
| database.commit() | |
| c.close() | |
| database.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment