Export documents from GitBook web pages
import csv
import re
from typing import Dict, List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Tag

# Configuration
BASE_URL = "https://pipeline.groupthought.com/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}
OUTPUT_CSV = "articles.csv"
# Function to extract the first email address from text
def extract_email(text: str) -> str:
    match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    return match.group(0) if match else ''
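# e.g. extract_email("Contact support@example.com for help") returns
# "support@example.com"; an empty string is returned when no address is found.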
# Strip the 'class' attribute from a tag so the exported HTML carries no CSS classes
def remove_class(tag: Tag):
    if 'class' in tag.attrs:
        del tag.attrs['class']
# Function to scrape an individual article page into a dict of CSV fields
def scrape_article(article_url: str) -> Optional[Dict[str, str]]:
    try:
        response = requests.get(article_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        h1 = soup.find('h1')
        article_name = h1.get_text(strip=True) if h1 else ''

        # Extract the article body and remove all CSS classes
        article_tags = soup.select('main div.grid.whitespace-pre-wrap')
        if article_tags:
            for result in article_tags:
                remove_class(result)
                for tag in result.find_all(True):  # True matches all tags
                    remove_class(tag)
            html_content = str(article_tags[0])
        else:
            html_content = ""

        breadcrumbs = soup.select('nav[aria-label="Breadcrumb"] a')  # Adjust based on the site's HTML
        category_path = " > ".join(crumb.get_text(strip=True) for crumb in breadcrumbs)
        category_url = urljoin(article_url, breadcrumbs[-1]['href']) if breadcrumbs else ''

        # Email extraction is disabled by default; call extract_email(soup.get_text()) to enable it
        author_email = ""

        date_tag = soup.find('time')
        date = date_tag['datetime'] if date_tag and date_tag.has_attr('datetime') else ''
        published = bool(date)

        return {
            "article name": article_name,
            "html content": html_content,
            "category path": category_path,
            "category url": category_url,
            "article url": article_url,
            "author email": author_email,
            "date": date,
            "published": str(published),
        }
    except Exception as e:
        print(f"[Error] Failed to scrape {article_url}: {e}")
        return None
# Function to scrape article links from the main/listing page
def get_article_links(page_url: str) -> List[str]:
    links = []
    try:
        response = requests.get(page_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Adjust this selector based on the actual site HTML
        for a in soup.select("ul.my-2 a.toclink"):
            href = a.get("href")
            if href:
                links.append(urljoin(BASE_URL, href))
    except Exception as e:
        print(f"[Error] Failed to fetch article list from {page_url}: {e}")
    return links
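# get_article_links only reads the table of contents on the page it is given,
# so nested GitBook pages are missed. A minimal, depth-limited crawl could look
# like the sketch below; crawl_links is a hypothetical helper (not part of the
# original script) and reuses the same "a.toclink" selector assumption as above.
def crawl_links(page_url: str, depth: int = 1, seen: Optional[set] = None) -> List[str]:
    seen = seen if seen is not None else set()
    links = []
    for link in get_article_links(page_url):
        if link in seen:
            continue
        seen.add(link)
        links.append(link)
        if depth > 0:
            links.extend(crawl_links(link, depth - 1, seen))
    return links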
# Main function: collect links, scrape each article, and write the CSV
def main():
    try:
        article_links = get_article_links(BASE_URL)
        with open(OUTPUT_CSV, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=[
                "id", "article name", "html content", "category path",
                "category url", "article url", "author email", "date", "published"
            ])
            writer.writeheader()
            for idx, url in enumerate(article_links, start=1):
                article_data = scrape_article(url)
                if article_data:
                    article_data["id"] = idx
                    writer.writerow(article_data)
                    print(f"[OK] Saved article {idx}: {article_data['article name']}")
    except Exception as e:
        print(f"[Error] {e}")

if __name__ == "__main__":
    main()
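A quick way to sanity-check the export after running the script (a minimal sketch; it assumes only the articles.csv file produced above):

import csv

with open("articles.csv", newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        print(row["id"], row["article name"], row["article url"])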