@hlb
Created December 3, 2023 10:34
iThome crawler — two scripts: one extracts the details of a single article, the other collects the article list from a series index page.
# Import the required modules
import requests
from bs4 import BeautifulSoup
import argparse
import json
import os

def extract_article_details(url):
    """
    Extract the details of an article from its URL.

    Args:
        url (str): The URL of the article.

    Returns:
        dict: A dictionary containing the article details.
    """
    # Define the HTTP request headers
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
    }
    # Create a Session object
    session = requests.Session()

    try:
        # Send a GET request to the given URL using the Session object
        response = session.get(url, headers=request_headers)
        response.raise_for_status()  # Raise an error if the request failed
    except requests.exceptions.HTTPError as e:
        print(f"Error while sending the request: {e}")
        return None

    # Parse the response content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the details from the parsed content
    content = soup.select_one(".qa-panel__content")
    if content is None:  # Bail out if the expected article panel is missing
        return None
    # Extract the individual fields
    title_selector = content.select_one(".qa-header__title")
    title = title_selector.get_text(strip=True) if title_selector else None

    series_name_selector = content.select_one(".ir-article__topic a")
    series_name = series_name_selector.get_text(strip=True) if series_name_selector else None

    author_name_selector = content.select_one(".ir-article-info__name")
    author_name = author_name_selector.get_text(strip=True) if author_name_selector else None

    day_number_selector = content.select_one(".ir-article__topic-count")
    day_number = day_number_selector.get_text(strip=True) if day_number_selector else None
    if day_number is not None:
        try:
            day_number = int(day_number)
        except ValueError:
            day_number = None

    # Extract the main article content
    main_content_div = content.select_one(".qa-markdown")
    if main_content_div:
        main_content = "\n".join([p.get_text(strip=True) for p in main_content_div.find_all(['p', 'h2', 'h3', 'li', 'pre'])])
    else:
        main_content = None
    # Collect the details into a single dictionary
    article_details = {
        "Article Title": title,
        "Article Content": main_content,
        "Series Name": series_name,
        "Author Name": author_name,
        "Day": day_number,
        "URL": url
    }
    return article_details

def sanitize_filename(filename):
    """
    Sanitize a filename by removing or replacing characters that are not
    allowed in file or directory names.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    # Characters that are not allowed in filenames
    disallowed_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|']
    for char in disallowed_chars:
        filename = filename.replace(char, "_")  # Replace disallowed characters with an underscore
    return filename
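# A quick illustration of the sanitization (hypothetical title, not from the gist):
#   sanitize_filename('Day 1: "Hello/World"?')  ->  'Day 1_ _Hello_World__'
# so series names and article titles can be used safely as directory and file names.
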
def save_to_file(article_details):
    # Create the 'data' directory if it does not exist
    if not os.path.exists('data'):
        os.makedirs('data')

    # Use the sanitized series name as the subdirectory name
    series_name_sanitized = sanitize_filename(article_details['Series Name'])
    series_dir = os.path.join('data', series_name_sanitized)

    # Create the series directory if it does not exist
    if not os.path.exists(series_dir):
        os.makedirs(series_dir)

    # Use the sanitized article title as the filename and save the JSON file
    article_title_sanitized = sanitize_filename(article_details['Article Title'])
    file_path = os.path.join(series_dir, article_title_sanitized + '.json')
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(article_details, f, indent=4, ensure_ascii=False)

if __name__ == "__main__":
    # Set up command-line argument parsing
    parser = argparse.ArgumentParser(description='Extract article details from the given URL.')
    parser.add_argument('URL', type=str, help='The URL of the article to scrape.')
    args = parser.parse_args()

    details = extract_article_details(args.URL)
    if details is None:
        print("Failed to extract article details.")
    else:
        print(f"Success: {details['Series Name']} - {details['Article Title']} - {details['URL']}")
        save_to_file(details)
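A usage sketch for the script above. The filename extract_article_details.py is an assumption (the gist does not name its files), and <article-url> stands in for a real iThome article URL:

    python extract_article_details.py "<article-url>"

On success it prints a summary line and writes the details to data/<Series Name>/<Article Title>.json, with both path components passed through sanitize_filename.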
# Import the required libraries
import requests  # For sending HTTP requests
from bs4 import BeautifulSoup  # For parsing HTML and XML documents
import re  # For regular expressions
import argparse  # For parsing command-line arguments
import json  # For handling JSON data

# Fetch the articles listed on a single page
def get_articles_from_page(url):
    # Define request headers that mimic a browser user agent
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
    }
    # Send a GET request to the provided URL
    response = requests.get(url, headers=request_headers)
    response.raise_for_status()  # Raise an error if the request failed

    # Parse the response content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the article list from the page
    qa_list_elements = soup.select(".qa-list")
    articles = []
    for item in qa_list_elements:
        title_link = item.select_one(".qa-list__title-link")
        day_element = item.select_one(".ir-qa-list__days")

        # Extract and clean the title, link and day
        title = title_link.text.strip() if title_link else None
        link = title_link['href'].strip() if title_link else None
        day = re.sub(' +', ' ', day_element.get_text().strip().replace('\n', ' ').strip()) if day_element else "999"

        # Keep the entry only if both a title and a link were found
        if title and link:
            articles.append({
                'title': title,
                'link': link,
                'day': day
            })
    return articles

# Fetch the pagination links from the index page
def get_pagination_links(url):
    # Define the request headers
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
    }
    # Send a GET request to the provided URL
    response = requests.get(url, headers=request_headers)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the numbered pagination links from the page
    pagination_links = soup.select(".pagination a")
    pages_links = [link['href'] for link in pagination_links if link.text.isdigit()]
    return pages_links

# Scrape the articles from the given URL and all of its pagination pages
def scrape_articles_from_url(url):
    all_articles = []

    # Get the articles from the initial page
    all_articles.extend(get_articles_from_page(url))

    # Get the pagination links and scrape the articles from each page
    pagination_links = get_pagination_links(url)
    for link in pagination_links:
        all_articles.extend(get_articles_from_page(link))

    return all_articles

# Run the following code when the script is executed directly
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Extract article list from a given URL.')
    parser.add_argument('URL', type=str, help='The URL of the index page to be scraped.')
    args = parser.parse_args()

    articles = scrape_articles_from_url(args.URL)
    print(json.dumps(articles, indent=4))  # Print the article list as JSON
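A usage sketch for the index scraper above, and one way to feed its output into the first script. The filenames, the jq/xargs pipeline, and <series-index-url> are assumptions for illustration, not part of the gist:

    python scrape_article_list.py "<series-index-url>" > articles.json
    jq -r '.[].link' articles.json | xargs -n 1 python extract_article_details.py

One caveat: get_pagination_links returns the href values exactly as they appear in the page, so if the pagination links are relative they would need to be resolved against the index URL before get_articles_from_page can fetch them.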