@odessy
Created August 8, 2025 18:23
Export articles from GitBook web pages to a CSV file
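The script crawls a GitBook site's listing page, visits each article, strips GitBook's CSS classes from the article HTML, and writes one CSV row per article to articles.csv. Its CSS selectors are tuned to one particular GitBook theme, so the short sketch below runs them against a made-up markup fragment (an assumption, not the live site's HTML) purely to illustrate what they expect to find; adjust the selectors if your site renders differently.

# Minimal, self-contained check of the selectors used in the script.
# The markup below is an assumed stand-in for GitBook's rendered HTML; class names
# vary by theme and version, so treat it only as an illustration.
from bs4 import BeautifulSoup

sample_html = """
<main>
  <nav aria-label="Breadcrumb"><a href="/docs">Docs</a> <a href="/docs/setup">Setup</a></nav>
  <h1>Setting up the pipeline</h1>
  <div class="grid whitespace-pre-wrap"><p>Article body goes here.</p></div>
</main>
"""

soup = BeautifulSoup(sample_html, "html.parser")
print(soup.select_one("main div.grid.whitespace-pre-wrap"))                     # article body container
print([a.get_text() for a in soup.select('nav[aria-label="Breadcrumb"] a')])    # ['Docs', 'Setup']

The full exporter script: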
import requests
from bs4 import BeautifulSoup, Tag
import csv
import re
from urllib.parse import urljoin
from typing import List, Dict, Optional

# Configuration
BASE_URL = "https://pipeline.groupthought.com/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}
OUTPUT_CSV = "articles.csv"


# Extract the first email address found in a block of text
def extract_email(text: str) -> str:
    match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    return match.group(0) if match else ''


# Remove the class attribute from a tag so the exported HTML carries no theme CSS hooks
def remove_class(tag: Tag):
    if 'class' in tag.attrs:
        del tag.attrs['class']


# Scrape an individual article page
def scrape_article(article_url: str) -> Optional[Dict[str, str]]:
    try:
        response = requests.get(article_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Article title comes from the page's <h1>
        h1 = soup.find('h1')
        article_name = h1.get_text(strip=True) if h1 else ''

        # Extract the article body and strip all CSS classes from it
        article_tag = soup.select('main div.grid.whitespace-pre-wrap')
        if article_tag:
            for result in article_tag:
                remove_class(result)
                for tag in result.find_all(True):  # True matches every descendant tag
                    remove_class(tag)
            html_content = str(article_tag[0])
        else:
            html_content = ""

        # Breadcrumb trail gives the category path; adjust the selector to the site's HTML
        breadcrumbs = soup.select('nav[aria-label="Breadcrumb"] a')
        category_path = " > ".join([crumb.get_text(strip=True) for crumb in breadcrumbs])
        category_url = urljoin(article_url, breadcrumbs[-1]['href']) if breadcrumbs else ''

        author_email = ""  # extract_email(soup.get_text()) - author extraction disabled for now

        # Publication date from the first <time> element, if present
        date_tag = soup.find('time')
        date = date_tag['datetime'] if date_tag and date_tag.has_attr('datetime') else ''
        published = bool(date)

        return {
            "article name": article_name,
            "html content": html_content,
            "category path": category_path,
            "category url": category_url,
            "article url": article_url,
            "author email": author_email,
            "date": date,
            "published": str(published)
        }
    except Exception as e:
        print(f"[Error] Failed to scrape {article_url}: {e}")
        return None


# Collect article links from the main/listing page
def get_article_links(page_url: str) -> List[str]:
    links = []
    try:
        response = requests.get(page_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Adjust this selector to match the actual site HTML
        for a in soup.select("ul.my-2 a.toclink"):
            href = a.get("href")
            if href:
                links.append(urljoin(BASE_URL, href))
    except Exception as e:
        print(f"[Error] Failed to fetch article list from {page_url}: {e}")
    return links


# Main entry point: crawl the listing page, scrape each article, and write the CSV
def main():
    try:
        article_links = get_article_links(BASE_URL)
        with open(OUTPUT_CSV, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=[
                "id", "article name", "html content", "category path",
                "category url", "article url", "author email", "date", "published"
            ])
            writer.writeheader()
            for idx, url in enumerate(article_links, start=1):
                article_data = scrape_article(url)
                if article_data:
                    article_data["id"] = idx
                    writer.writerow(article_data)
                    print(f"[OK] Saved article {idx}: {article_data['article name']}")
    except Exception as e:
        print(f"[Error] Failed to export articles: {e}")


if __name__ == "__main__":
    main()
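
Before running a full export, it can help to scrape a single article and confirm the selectors match the live markup. A minimal sketch, assuming the gist is saved as scrape_gitbook.py; the article URL is a placeholder:

# Dry-run one article to verify the selectors before exporting everything.
# scrape_gitbook.py and the URL below are placeholders; point them at your file and site.
from scrape_gitbook import scrape_article

row = scrape_article("https://pipeline.groupthought.com/getting-started")  # hypothetical URL
if row and row["html content"]:
    print(row["article name"], "|", row["category path"], "|", row["date"])
else:
    print("Empty result - adjust the CSS selectors to the site's actual HTML.")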