Export documents from GitBook web pages
import csv
import re
from typing import Dict, List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Tag

# Configuration
BASE_URL = "https://pipeline.groupthought.com/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}
OUTPUT_CSV = "articles.csv"
# Function to extract the first email address from text
def extract_email(text: str) -> str:
    match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    return match.group(0) if match else ''
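# e.g. extract_email("Contact support@example.com for help") returns
# "support@example.com"; an empty string is returned when no address is found.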
# Strip the 'class' attribute from a tag so the exported HTML carries no CSS classes
def remove_class(tag: Tag):
    if 'class' in tag.attrs:
        del tag.attrs['class']
# Function to scrape an individual article page into a dict of CSV fields
def scrape_article(article_url: str) -> Optional[Dict[str, str]]:
    try:
        response = requests.get(article_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        h1 = soup.find('h1')
        article_name = h1.get_text(strip=True) if h1 else ''

        # Extract the article body and remove all CSS classes
        article_tags = soup.select('main div.grid.whitespace-pre-wrap')
        if article_tags:
            for result in article_tags:
                remove_class(result)
                for tag in result.find_all(True):  # True matches all tags
                    remove_class(tag)
            html_content = str(article_tags[0])
        else:
            html_content = ""

        breadcrumbs = soup.select('nav[aria-label="Breadcrumb"] a')  # Adjust based on the site's HTML
        category_path = " > ".join(crumb.get_text(strip=True) for crumb in breadcrumbs)
        category_url = urljoin(article_url, breadcrumbs[-1]['href']) if breadcrumbs else ''

        # Email extraction is disabled by default; call extract_email(soup.get_text()) to enable it
        author_email = ""

        date_tag = soup.find('time')
        date = date_tag['datetime'] if date_tag and date_tag.has_attr('datetime') else ''
        published = bool(date)

        return {
            "article name": article_name,
            "html content": html_content,
            "category path": category_path,
            "category url": category_url,
            "article url": article_url,
            "author email": author_email,
            "date": date,
            "published": str(published),
        }
    except Exception as e:
        print(f"[Error] Failed to scrape {article_url}: {e}")
        return None
# Function to scrape article links from the main/listing page
def get_article_links(page_url: str) -> List[str]:
    links = []
    try:
        response = requests.get(page_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Adjust this selector based on the actual site HTML
        for a in soup.select("ul.my-2 a.toclink"):
            href = a.get("href")
            if href:
                links.append(urljoin(BASE_URL, href))
    except Exception as e:
        print(f"[Error] Failed to fetch article list from {page_url}: {e}")
    return links
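# get_article_links only reads the table of contents on the page it is given,
# so nested GitBook pages are missed. A minimal, depth-limited crawl could look
# like the sketch below; crawl_links is a hypothetical helper (not part of the
# original script) and reuses the same "a.toclink" selector assumption as above.
def crawl_links(page_url: str, depth: int = 1, seen: Optional[set] = None) -> List[str]:
    seen = seen if seen is not None else set()
    links = []
    for link in get_article_links(page_url):
        if link in seen:
            continue
        seen.add(link)
        links.append(link)
        if depth > 0:
            links.extend(crawl_links(link, depth - 1, seen))
    return links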
# Main function: collect links, scrape each article, and write the CSV
def main():
    try:
        article_links = get_article_links(BASE_URL)
        with open(OUTPUT_CSV, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=[
                "id", "article name", "html content", "category path",
                "category url", "article url", "author email", "date", "published"
            ])
            writer.writeheader()
            for idx, url in enumerate(article_links, start=1):
                article_data = scrape_article(url)
                if article_data:
                    article_data["id"] = idx
                    writer.writerow(article_data)
                    print(f"[OK] Saved article {idx}: {article_data['article name']}")
    except Exception as e:
        print(f"[Error] {e}")

if __name__ == "__main__":
    main()
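A quick way to sanity-check the export after running the script (a minimal sketch; it assumes only the articles.csv file produced above):

import csv

with open("articles.csv", newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        print(row["id"], row["article name"], row["article url"])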