Created
December 12, 2023 10:41
-
-
Save Alfex4936/2ef795b0976548ecb73a5bf3fe2b338e to your computer and use it in GitHub Desktop.
Download Joseon dynasty annals
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re

import requests
import urllib3
from selectolax.parser import HTMLParser

# Silence urllib3's warnings (main() creates a session with verify=False).
urllib3.disable_warnings()
def cleanup(tree):
    """Return the Korean-only plain text of an article node.

    The node is modified in place: superscripts, footnotes, source notes and
    view-line elements are removed before the text is extracted.

    Args:
        tree: A parsed HTML node exposing ``css(selector)`` and
            ``text(deep=..., separator=..., strip=...)``; nodes returned by
            ``css`` must expose ``decompose()``.

    Returns:
        The extracted text with every character removed that is not a Hangul
        syllable, whitespace, a digit or one of ``.,!?;:``, then whitespace
        normalised and leading/trailing whitespace stripped.
    """
    # Drop markup that should not appear in the output text.
    for selector in ("sup", ".ins_footnote", ".ins_source", ".ins_view_line"):
        for node in tree.css(selector):
            node.decompose()

    text = tree.text(deep=True, separator="", strip=False)

    # Remove Chinese characters (and anything else that is not Hangul,
    # whitespace, digits or basic punctuation).
    text = re.sub(r"[^\uAC00-\uD7A3\s\d.,!?;:]", "", text)
    # Collapse any whitespace run after a full stop (including newlines)
    # into a single space.
    text = re.sub(r"\.\s+", ". ", text)
    # NOTE(review): runs of two or more spaces are deleted outright, not
    # reduced to one space, so "가  나" becomes "가나" -- confirm this is
    # intended before changing it.
    text = re.sub(r" {2,}", "", text)
    return text.strip()
def get_page(session, url, timeout=30):
    """Fetch ``url`` through ``session`` and return the raw response body.

    Args:
        session: Object with a ``get(url, timeout=...)`` method, e.g. a
            ``requests.Session``.
        url: Address of the page to download.
        timeout: Seconds passed to ``session.get`` as its ``timeout``
            argument. Defaults to 30.

    Returns:
        The response ``content`` on success, or ``None`` when the request
        fails or the server answers with an HTTP error status. Failures are
        reported on stdout and not raised.
    """
    try:
        page = session.get(url, timeout=timeout)
        page.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # Must be listed before RequestException, of which it is a subclass.
        print(f"HTTP error: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None
    return page.content
def _extract_article(tree):
    """Return ``(main_title, sub_title, korean_text)`` for a parsed page.

    ``sub_title`` is ``None`` when the title contains no comma. Returns
    ``None`` when the page lacks either the ``.tit_loc`` title node or the
    ``.ins_view.ins_view_left`` body node.
    """
    title_node = tree.css_first(".tit_loc")
    if not title_node:
        return None
    kor = tree.css_first(".ins_view.ins_view_left")
    if not kor:
        return None

    extracted_title = title_node.text(separator=" ", strip=True).rstrip()
    # Everything before the first comma names the output file; the rest (if
    # any) is written as a per-article prefix.
    main_title, comma, rest = extracted_title.partition(",")
    sub_title = rest.strip() if comma else None
    return main_title.strip(), sub_title, cleanup(kor)


def _next_doc_id(tree):
    """Return the document id taken from the page's second button link.

    Returns ``None`` when that link is missing, has no usable ``href``, or
    the extracted id is empty.
    """
    links = tree.css("ul.view_btnset.mt_-20 li a")
    try:
        # Strips a 19-character prefix and a 3-character suffix from the
        # href; presumably a ``javascript:...('<id>');`` wrapper -- TODO
        # confirm against the live markup.
        next_id = links[1].attributes["href"][19:-3]
    except (IndexError, KeyError, TypeError):
        return None
    return next_id or None


def main():
    """Download articles book by book and append them to text files.

    Starting from each seed id in ``ids``, the page is fetched, its Korean
    text is appended to ``archieve/<main title>.txt`` (one file per distinct
    main title), and the id from the page's second button link is followed.
    When no further id is available, or a page cannot be fetched, the crawl
    moves on to the next seed id.
    """
    url_base = "http://sillok.history.go.kr/id/"  # Joseon annals DB website
    ids = [
        # Seed ids currently disabled:
        # "kaa_10107017_001",
        # "kba_10101001_001",
        # "kca_10101001_001",
        # "kda_10008011_001",
        "kea_10002018_001",
        "kfa_10005014_001",
        "kga_10106111_001",
        "kha_10009007_001",
        "kia_10011028_001",
        "kja_10012025_001",
        "kka_10109002_001",
        "kla_10101001_001",
        "kma_10007007_001",
        "kna_10007004_001",
        "knb_10007003_001",
        "koa_10002001_001",
        "kob_10002001_001",
        "kpa_10103013_001",
        "kqa_10005008_001",
        "kra_10005004_001",
        "krb_10005004_001",
        "ksa_10008018_001",
        "ksb_10008026_001",
        "kta_10006008_001",
        "ktb_10006008_001",
        "kua_10008030_001",
        "kva_10003010_001",
        "kwa_10007004_001",
        "kxa_10011018_001",
        "kya_10006009_001",
    ]

    output_dir = "archieve"
    # open(..., "a+") does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    session = requests.Session()
    # NOTE(review): certificate verification is disabled; url_base is plain
    # http, so confirm whether this is still required.
    session.verify = False

    idx = 0
    previous_title = None
    file_handle = None
    doc_id = ids[idx]

    try:
        while idx < len(ids):
            url = url_base + doc_id
            print("페이지 얻는 중:", url)
            html_doc = get_page(session, url)

            next_id = None
            if html_doc:
                tree = HTMLParser(html_doc)
                article = _extract_article(tree)
                if article:
                    main_title, additional_title, kor_text = article

                    # Has the title changed?
                    if main_title != previous_title:
                        if file_handle:
                            file_handle.close()
                        # New file name per volume, e.g. 태조실록_1권.txt
                        file_name = f"{main_title.replace(' ', '_')}.txt"
                        file_handle = open(
                            os.path.join(output_dir, file_name),
                            "a+",
                            encoding="utf-8",
                        )
                        file_handle.write(f"{main_title}\n")
                        file_handle.write("=" * 5 + "\n")
                        previous_title = main_title

                    # Write the article body to the file.
                    if additional_title is not None:
                        file_handle.write(f"{additional_title}: {kor_text}\n")
                    else:
                        file_handle.write(f"{kor_text}\n")

                # NOTE(review): the pasted source had lost its indentation;
                # the next-link lookup is done for every fetched page so a
                # page without title/body cannot stall the loop -- confirm.
                next_id = _next_doc_id(tree)
            else:
                print("Failed to get page, skipping.")

            if next_id:
                doc_id = next_id
                continue

            # No next article (or the fetch failed): move to the next seed
            # id, and refresh doc_id so the same URL is not requested again.
            idx += 1
            if idx < len(ids):
                doc_id = ids[idx]
                print(f"Next book: {doc_id}")
            else:
                print("All documents processed.")
                break
    finally:
        # Flush and close the last output file even on error or Ctrl-C.
        if file_handle:
            file_handle.close()
        session.close()
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment