Skip to content

Instantly share code, notes, and snippets.

@BeMg
Created December 11, 2019 05:29
Show Gist options
  • Select an option

  • Save BeMg/833edb631dea5276fdaec6bf8a4b9469 to your computer and use it in GitHub Desktop.

Select an option

Save BeMg/833edb631dea5276fdaec6bf8a4b9469 to your computer and use it in GitHub Desktop.
Download all text from UU (uukanshu.com)
import requests
from bs4 import BeautifulSoup
import sys
from opencc import OpenCC
def s2t(text):
    """Run ``text`` through OpenCC using its ``s2t`` configuration.

    Args:
        text: The string to convert.

    Returns:
        The string returned by ``OpenCC('s2t').convert``.
    """
    return OpenCC('s2t').convert(text)
def get_html_source(link, timeout=30):
    """Fetch a page and return its body decoded as GB2312 text.

    Args:
        link: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``str``, decoded with the ``gb2312`` codec.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    r = requests.get(link, timeout=timeout)
    # Fail here on 4xx/5xx instead of handing an error page to the parser.
    r.raise_for_status()
    # Set the encoding explicitly so r.text is decoded as gb2312 regardless
    # of what requests would otherwise pick.
    r.encoding = 'gb2312'
    return r.text
def extract_content(link):
    """Download one page and return the text of its title and content nodes.

    Args:
        link: URL of the page to download and parse.

    Returns:
        A ``(title, content)`` tuple of strings: the text of the element with
        id ``timu`` and the text of the element with id ``contentbox``.

    Raises:
        ValueError: If either element is missing from the page.
    """
    soup = BeautifulSoup(get_html_source(link), 'html.parser')
    title = soup.find(id='timu')
    content = soup.find(id='contentbox')
    # find() returns None when nothing matches; report which element is
    # missing and on which page rather than failing on ``None.text``.
    if title is None:
        raise ValueError("element #timu not found in {}".format(link))
    if content is None:
        raise ValueError("element #contentbox not found in {}".format(link))
    return title.text, content.text
def get_from_table(link):
    """Return the link tags listed in a page's ``ul#chapterList`` element.

    Args:
        link: URL of the page containing the ``<ul id="chapterList">`` list.

    Returns:
        The ``<a>`` tags (those having an ``href`` attribute) found inside
        the list, in the reverse of their order on the page.

    Raises:
        ValueError: If the page has no ``<ul id="chapterList">`` element.
    """
    soup = BeautifulSoup(get_html_source(link), 'html.parser')
    chapter_list = soup.find("ul", id="chapterList")
    if chapter_list is None:
        # find() returns None when nothing matches; fail with context
        # instead of an AttributeError on the next line.
        raise ValueError("element ul#chapterList not found in {}".format(link))
    anchors = chapter_list.find_all("a", href=True)
    # Flip the page order before returning.
    anchors.reverse()
    return anchors
if __name__ == "__main__":
    # Usage: python <script> <URL of the page holding ul#chapterList>
    if len(sys.argv) < 2:
        sys.exit("usage: {} <chapter-list URL>".format(sys.argv[0]))
    start = sys.argv[1]
    chaplists = get_from_table(start)
    # Each href is appended to the site root to form the page URL.
    root = "https://www.uukanshu.com"
    links = [root + chap['href'] for chap in chaplists]
    # Collect pieces and join once instead of growing a string with +=.
    pieces = []
    for link in links:
        title, content = extract_content(link)
        print(title)  # progress indicator
        pieces.extend((title, "\n\n", content, "\n\n"))
    all_text = s2t("".join(pieces))
    # Write as UTF-8 explicitly: the platform default encoding may be unable
    # to represent the converted Chinese text.
    with open("tmp.txt", "w", encoding="utf-8") as f:
        f.write(all_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment