Created December 3, 2023 10:34
iThome scraper scripts
# Import the required modules
import requests
from bs4 import BeautifulSoup
import argparse
import json
import os


def extract_article_details(url):
    """
    Extract detailed information from an article URL.

    Args:
        url (str): The URL of the article.

    Returns:
        dict: A dictionary with the article details, or None on failure.
    """
    # Define the HTTP request headers
    request_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
        )
    }
    # Create a Session object
    session = requests.Session()
    try:
        # Send a GET request to the given URL with the Session object
        response = session.get(url, headers=request_headers)
        response.raise_for_status()  # Raise an error if the request failed
    except requests.RequestException as e:
        print(f"Error sending request: {e}")
        return None

    # Parse the response content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the article panel in the parsed content
    content = soup.select_one(".qa-panel__content")
    if content is None:
        print("Could not find the article content panel on the page.")
        return None

    # Extract the individual pieces of information
    title_selector = content.select_one(".qa-header__title")
    title = title_selector.get_text(strip=True) if title_selector else None

    series_name_selector = content.select_one(".ir-article__topic a")
    series_name = series_name_selector.get_text(strip=True) if series_name_selector else None

    author_name_selector = content.select_one(".ir-article-info__name")
    author_name = author_name_selector.get_text(strip=True) if author_name_selector else None

    day_number_selector = content.select_one(".ir-article__topic-count")
    day_number = day_number_selector.get_text(strip=True) if day_number_selector else None
    if day_number is not None:
        try:
            day_number = int(day_number)
        except ValueError:
            day_number = None

    # Extract the main article content
    main_content_div = content.select_one(".qa-markdown")
    if main_content_div:
        main_content = "\n".join([p.get_text(strip=True) for p in main_content_div.find_all(['p', 'h2', 'h3', 'li', 'pre'])])
    else:
        main_content = None

    # Collect the details into a single dictionary
    article_details = {
        "Article Title": title,
        "Article Content": main_content,
        "Series Name": series_name,
        "Author Name": author_name,
        "Day": day_number,
        "URL": url
    }
    return article_details


def sanitize_filename(filename):
    """
    Sanitize a filename by replacing characters that are not allowed
    in file or directory names.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    # Characters that are not allowed in filenames
    disallowed_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|']
    for char in disallowed_chars:
        filename = filename.replace(char, "_")  # Replace disallowed characters with underscores
    return filename


def save_to_file(article_details):
    # Create the 'data' directory if it does not exist
    if not os.path.exists('data'):
        os.makedirs('data')

    # Use the sanitized series name as the subdirectory name
    # (fall back to a placeholder if the series name could not be extracted)
    series_name_sanitized = sanitize_filename(article_details['Series Name'] or "unknown_series")
    series_dir = os.path.join('data', series_name_sanitized)

    # Create the series directory if it does not exist
    if not os.path.exists(series_dir):
        os.makedirs(series_dir)

    # Save the JSON file, using the sanitized article title as the filename
    article_title_sanitized = sanitize_filename(article_details['Article Title'] or "untitled")
    file_path = os.path.join(series_dir, article_title_sanitized + '.json')
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(article_details, f, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    # Set up command-line argument parsing
    parser = argparse.ArgumentParser(description='Extract article details from the given URL.')
    parser.add_argument('URL', type=str, help='The URL of the article to scrape.')
    args = parser.parse_args()

    details = extract_article_details(args.URL)
    if details is None:
        print("Failed to extract article details.")
    else:
        print(f"Success: {details['Series Name']} - {details['Article Title']} - {details['URL']}")
        save_to_file(details)
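The script takes a single article URL on the command line and writes the extracted details to data/<Series Name>/<Article Title>.json. Below is a minimal usage sketch that calls the functions directly from another Python file; the module name extract_article and the article URL are placeholder assumptions, not part of the gist.

# Hypothetical usage sketch: the module name "extract_article" and the URL below
# are placeholders; adjust them to wherever the script above is saved.
from extract_article import extract_article_details, save_to_file

details = extract_article_details("https://ithelp.ithome.com.tw/articles/10293456")
if details:
    save_to_file(details)  # writes data/<Series Name>/<Article Title>.json
    print(details["Article Title"], details["Day"])

The second script in the gist, below, crawls a series index page (including its pagination pages) and prints the title, link, and day label of every article it finds.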
# Import the required libraries
import requests                 # For sending HTTP requests
from bs4 import BeautifulSoup   # For parsing HTML and XML documents
import re                       # For regular-expression processing
import argparse                 # For parsing command-line arguments
import json                     # For handling JSON data


def get_articles_from_page(url):
    """Fetch the articles listed on a single index page."""
    # Request headers that mimic a browser user agent
    request_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
        )
    }
    # Send a GET request to the given URL
    response = requests.get(url, headers=request_headers)
    response.raise_for_status()  # Raise an error if the request failed

    # Parse the response content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the list of articles from the page
    qa_list_elements = soup.select(".qa-list")
    articles = []
    for item in qa_list_elements:
        title_link = item.select_one(".qa-list__title-link")
        day_element = item.select_one(".ir-qa-list__days")

        # Extract and clean up the title, link, and day label
        title = title_link.text.strip() if title_link else None
        link = title_link['href'].strip() if title_link else None
        day = re.sub(' +', ' ', day_element.get_text().strip().replace('\n', ' ').strip()) if day_element else "999"

        # Keep the entry only if it has at least a title and a link
        if title and link:
            articles.append({
                'title': title,
                'link': link,
                'day': day
            })
    return articles


def get_pagination_links(url):
    """Collect the numbered pagination links from an index page."""
    # Request headers
    request_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
        )
    }
    # Send a GET request to the given URL
    response = requests.get(url, headers=request_headers)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the numbered pagination links from the page
    pagination_links = soup.select(".pagination a")
    pages_links = [link['href'] for link in pagination_links if link.text.isdigit()]
    # Deduplicate while preserving order, in case a page appears more than once
    return list(dict.fromkeys(pages_links))


def scrape_articles_from_url(url):
    """Scrape articles from the given index URL and all of its pagination pages."""
    all_articles = []
    # Fetch the articles on the initial page
    all_articles.extend(get_articles_from_page(url))

    # Fetch the pagination links and scrape the articles on each page
    pagination_links = get_pagination_links(url)
    for link in pagination_links:
        all_articles.extend(get_articles_from_page(link))

    # Deduplicate by link, since the first page may also appear in the pagination
    unique_articles = list({article['link']: article for article in all_articles}.values())
    return unique_articles


# Run the following when the script is executed directly
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Extract article list from a given URL.')
    parser.add_argument('URL', type=str, help='The URL of the index page to be scraped.')
    args = parser.parse_args()

    articles = scrape_articles_from_url(args.URL)
    print(json.dumps(articles, indent=4, ensure_ascii=False))  # Print the article list as JSON
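Since the list scraper emits each article's link, the two scripts can be chained into a simple pipeline. A minimal sketch follows, assuming the two files are importable as list_articles and extract_article (both hypothetical module names) and using a placeholder series URL.

# Hypothetical pipeline sketch: module names "list_articles" / "extract_article"
# and the series URL are placeholders, not defined by the gist itself.
from list_articles import scrape_articles_from_url
from extract_article import extract_article_details, save_to_file

series_url = "https://ithelp.ithome.com.tw/users/20000000/ironman/1234"  # placeholder
for item in scrape_articles_from_url(series_url):
    details = extract_article_details(item["link"])
    if details:
        save_to_file(details)  # one JSON file per article under data/<Series Name>/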