Skip to content

Instantly share code, notes, and snippets.

@Alfex4936
Created December 12, 2023 10:41
Show Gist options
  • Save Alfex4936/2ef795b0976548ecb73a5bf3fe2b338e to your computer and use it in GitHub Desktop.
Save Alfex4936/2ef795b0976548ecb73a5bf3fe2b338e to your computer and use it in GitHub Desktop.
Download Joseon dynasty annals
import re
import requests
import urllib3
from selectolax.parser import HTMLParser

# Silence urllib3's InsecureRequestWarning — requests below run with verify=False.
urllib3.disable_warnings()
def cleanup(tree):
    """Strip annotations from a parsed sillok page and return Korean-only text.

    Removes <sup> footnote markers and annotation containers from *tree*,
    then filters the remaining text down to Hangul syllables, whitespace,
    digits, and basic punctuation.

    Args:
        tree: a selectolax node (the Korean-translation pane of the page).

    Returns:
        The cleaned, stripped text as a single string.
    """
    # Drop footnote markers and annotation blocks before extracting text.
    for node in tree.css("sup"):
        node.decompose()
    for class_name in (".ins_footnote", ".ins_source", ".ins_view_line"):
        for node in tree.css(class_name):
            node.decompose()
    text = tree.text(deep=True, separator="", strip=False)
    # Keep only Hangul syllables (U+AC00..U+D7A3), whitespace, digits and
    # punctuation — this removes the interleaved Chinese (hanja) characters.
    text = re.sub(r"[^\uAC00-\uD7A3\s\d.,!?;:]", "", text)
    # Normalize spacing after sentence-ending periods.
    text = re.sub(r"\.\s+", ". ", text)
    # BUG FIX: the original replaced runs of 2+ spaces with "" which glued
    # together words that had hanja removed between them; collapse the run
    # to a single space instead.
    text = re.sub(r" {2,}", " ", text)
    return text.strip()
def get_page(session, url):
    """Fetch *url* through *session* and return the raw response body.

    Args:
        session: a requests.Session (or compatible) object.
        url: the absolute URL to fetch.

    Returns:
        The response content as bytes on success, or None when the request
        fails or the server answers with an HTTP error status.
    """
    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.HTTPError as exc:
        print(f"HTTP error: {exc}")
    except requests.exceptions.RequestException as exc:
        print(f"Error: {exc}")
    else:
        return response.content
    return None
def main():
    """Download Joseon Dynasty annals (조선왕조실록) volume by volume.

    Starts from one document id per book (king), follows each page's
    "next document" link, and appends the cleaned Korean text to one file
    per volume under archieve/.
    """
    import os  # local import: only needed to create the output directory

    url_base = "http://sillok.history.go.kr/id/"  # Joseon Annals DB website
    # First document id of each book; commented-out ids were already fetched.
    ids = [
        # "kaa_10107017_001",
        # "kba_10101001_001",
        # "kca_10101001_001",
        # "kda_10008011_001",
        "kea_10002018_001",
        "kfa_10005014_001",
        "kga_10106111_001",
        "kha_10009007_001",
        "kia_10011028_001",
        "kja_10012025_001",
        "kka_10109002_001",
        "kla_10101001_001",
        "kma_10007007_001",
        "kna_10007004_001",
        "knb_10007003_001",
        "koa_10002001_001",
        "kob_10002001_001",
        "kpa_10103013_001",
        "kqa_10005008_001",
        "kra_10005004_001",
        "krb_10005004_001",
        "ksa_10008018_001",
        "ksb_10008026_001",
        "kta_10006008_001",
        "ktb_10006008_001",
        "kua_10008030_001",
        "kva_10003010_001",
        "kwa_10007004_001",
        "kxa_10011018_001",
        "kya_10006009_001",
    ]

    session = requests.Session()
    session.verify = False  # the site has TLS issues; warnings silenced at import

    # FIX: create the output directory up front instead of crashing on open().
    os.makedirs("archieve", exist_ok=True)

    idx = 0
    previous_title = None
    file_handle = None
    doc_id = ids[idx]
    try:
        while idx < len(ids):
            url = url_base + doc_id
            print("페이지 얻는 중:", url)
            html_doc = get_page(session, url)
            if html_doc:
                tree = HTMLParser(html_doc)
                title_node = tree.css_first(".tit_loc")
                if title_node:
                    extracted_title = title_node.text(
                        separator=" ", strip=True
                    ).rstrip()
                    title_parts = extracted_title.split(",", 1)
                    main_title = title_parts[0].strip()
                    kor = tree.css_first(".ins_view.ins_view_left")
                    if kor:
                        kor_text = cleanup(kor)
                        # Title changed -> start a new per-volume file,
                        # e.g. 태조실록_1권.txt
                        if main_title != previous_title:
                            if file_handle:
                                file_handle.close()
                            file_name = f"{main_title.replace(' ', '_')}.txt"
                            file_handle = open(
                                f"archieve/{file_name}", "a+", encoding="utf-8"
                            )
                            file_handle.write(f"{main_title}\n")
                            file_handle.write("=" * 5 + "\n")
                            previous_title = main_title
                        # Append this entry's text to the current volume file.
                        if len(title_parts) > 1:
                            additional_title = title_parts[1].strip()
                            file_handle.write(f"{additional_title}: {kor_text}\n")
                        else:
                            file_handle.write(f"{kor_text}\n")
                # The second "view_btnset" link points at the next document;
                # the slice strips the JS wrapper around its id.
                # FIX: catch only the exceptions this lookup can raise
                # (missing link, missing href, None href) instead of a
                # bare except that hid every error.
                try:
                    doc_id = tree.css("ul.view_btnset.mt_-20 li a")[1].attributes[
                        "href"
                    ][19:-3]
                except (IndexError, KeyError, TypeError):
                    # No "next" link: this book is done, move to the next one.
                    idx += 1
                    if idx < len(ids):
                        doc_id = ids[idx]
                        print(f"Next book: {doc_id}")
                    else:
                        print("All documents processed.")
                        break
            else:
                print("Failed to get page, skipping.")
                idx += 1
                # FIX: advance doc_id too; the original kept retrying the same
                # failing URL while idx silently ran past the remaining books.
                if idx < len(ids):
                    doc_id = ids[idx]
    finally:
        # FIX: the last open file handle leaked in the original.
        if file_handle:
            file_handle.close()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment