Created
December 11, 2019 05:29
-
-
Save BeMg/833edb631dea5276fdaec6bf8a4b9469 to your computer and use it in GitHub Desktop.
Download the text of every chapter listed on a UU (uukanshu.com) table-of-contents page
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from bs4 import BeautifulSoup | |
| import sys | |
| from opencc import OpenCC | |
def s2t(text):
    """Run ``text`` through OpenCC's ``'s2t'`` conversion and return the result.

    ``'s2t'`` is OpenCC's Simplified-Chinese-to-Traditional-Chinese
    configuration. A new converter is constructed on every call.
    """
    return OpenCC('s2t').convert(text)
def get_html_source(link, encoding='gb2312', timeout=30):
    """Fetch ``link`` over HTTP and return the response body decoded as text.

    Args:
        link: URL of the page to download.
        encoding: Character encoding forced on the response before it is
            decoded. Defaults to ``'gb2312'``, the value this script has
            always used.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole download.

    Returns:
        The decoded page source as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    r = requests.get(link, timeout=timeout)
    # Fail loudly on error pages instead of handing their HTML to the parser.
    r.raise_for_status()
    # Override whatever charset requests inferred from the response headers.
    r.encoding = encoding
    return r.text
def extract_content(link):
    """Download one chapter page and return its title and body text.

    The page is fetched with ``get_html_source`` and parsed with
    BeautifulSoup's ``html.parser``. The title is the text of the element
    with id ``timu``; the body is the text of the element with id
    ``contentbox``.

    Args:
        link: URL of the chapter page.

    Returns:
        A ``(title_text, content_text)`` tuple of strings.

    Raises:
        ValueError: If the page has no ``timu`` or no ``contentbox`` element
            (for example an error page or a changed layout).
    """
    soup = BeautifulSoup(get_html_source(link), 'html.parser')
    title = soup.find(id='timu')
    content = soup.find(id='contentbox')

    # find() returns None when nothing matches; report which element is
    # missing rather than failing later with an AttributeError on ``.text``.
    missing = [
        element_id
        for element_id, node in (('timu', title), ('contentbox', content))
        if node is None
    ]
    if missing:
        raise ValueError(
            "page {} is missing element(s) with id: {}".format(
                link, ", ".join(missing)
            )
        )

    return title.text, content.text
def get_from_table(link):
    """Return the chapter anchors found on a table-of-contents page.

    The page is fetched with ``get_html_source`` and parsed with
    BeautifulSoup's ``html.parser``. Every ``<a>`` tag that carries an
    ``href`` attribute inside ``<ul id="chapterList">`` is collected.

    Args:
        link: URL of the table-of-contents page.

    Returns:
        The matching ``<a>`` tags in the reverse of their order on the page.

    Raises:
        ValueError: If the page contains no ``<ul id="chapterList">``.
    """
    soup = BeautifulSoup(get_html_source(link), 'html.parser')
    chapter_list = soup.find("ul", id="chapterList")
    if chapter_list is None:
        raise ValueError(
            "no <ul id=\"chapterList\"> element found at {}".format(link)
        )

    anchors = chapter_list.find_all("a", href=True)
    # NOTE(review): presumably the site lists the newest chapter first, so
    # reversing yields reading order -- confirm against the live page.
    anchors.reverse()
    return anchors
if __name__ == "__main__":
    # Script entry point: download every chapter linked from the
    # table-of-contents URL given as the first command-line argument, convert
    # the combined text with s2t(), and write it to tmp.txt.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <table-of-contents-url>".format(sys.argv[0]))

    start = sys.argv[1]
    chaplists = get_from_table(start)

    # Chapter hrefs are joined onto the site root by plain concatenation, so
    # they are expected to be site-relative paths beginning with "/".
    root = "https://www.uukanshu.com"
    links = [root + chap['href'] for chap in chaplists]

    # Collect the pieces and join once instead of growing a string with +=.
    pieces = []
    for link in links:
        title, content = extract_content(link)
        print(title)  # progress indicator: one line per downloaded chapter
        pieces.extend((title, "\n\n", content, "\n\n"))

    all_text = s2t("".join(pieces))

    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent Chinese text and would raise UnicodeEncodeError.
    with open("tmp.txt", "w", encoding="utf-8") as f:
        f.write(all_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment