@maxko87 · Created August 28, 2024 02:43
Scrape an Intercom help center
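A single-file scraper for an Intercom-hosted help center (here, https://help.dover.com/en/). It walks each collection on the landing page, fetches every article, and saves the content in three forms: full JSON, Markdown, and a stripped-down "ultra-compact" Markdown. It needs Python 3 plus requests, beautifulsoup4, and markdownify (pip install requests beautifulsoup4 markdownify). The class names it matches on (collection-link, font-semibold) come from Intercom's stock help-center theme; a customized theme may need different selectors.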
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

main_url = "https://help.dover.com/en/"

def fetch_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def parse_collection(collection_url):
    html = fetch_url(collection_url)
    if not html:
        return []
    soup = BeautifulSoup(html, "html.parser")
    articles = []
    seen = set()
    for article_link in soup.find_all("a", href=lambda x: x and "/articles/" in x):
        # Resolve relative hrefs against the collection URL and skip
        # duplicate links to the same article.
        article_url = urljoin(collection_url, article_link["href"])
        if article_url in seen:
            continue
        seen.add(article_url)
        article_html = fetch_url(article_url)
        if not article_html:
            continue
        article_soup = BeautifulSoup(article_html, "html.parser")
        title = article_soup.find("h1")
        content = article_soup.find("article")
        if title and content:
            articles.append(
                {
                    "title": title.get_text(strip=True),
                    "url": article_url,
                    "content_html": content.prettify(),
                    "content_md": md(str(content), heading_style="ATX"),
                }
            )
    return articles

def scrape_help_center():
    html = fetch_url(main_url)
    if not html:
        return []
    soup = BeautifulSoup(html, "html.parser")
    collections = []
    for card in soup.find_all("a", class_="collection-link"):
        # Resolve relative hrefs against the help center root.
        collection_url = urljoin(main_url, card["href"])
        collection_name = card.find("div", class_=lambda x: x and "font-semibold" in x)
        if collection_name:
            collection_name = collection_name.get_text(strip=True)
        else:
            collection_name = "Unnamed Collection"
        print(f"Processing Collection: {collection_name}")
        articles = parse_collection(collection_url)
        collections.append(
            {"name": collection_name, "url": collection_url, "articles": articles}
        )
        time.sleep(1)  # Be nice to the server
    return collections

def generate_markdown_only(help_center_data):
    markdown_content = ""
    for collection in help_center_data:
        markdown_content += f"# {collection['name']}\n\n"
        for article in collection["articles"]:
            markdown_content += f"## {article['title']}\n\n"
            markdown_content += article["content_md"] + "\n\n"
    return markdown_content

def generate_ultra_compact(help_center_data):
    compact_content = ""
    for collection in help_center_data:
        compact_content += f"# {collection['name']}\n\n"
        for article in collection["articles"]:
            compact_content += f"## {article['title']}\n\n"
            content = article["content_md"]
            # Remove bare URLs
            content = re.sub(r"https?://\S+", "", content)
            # Remove image references
            content = re.sub(r"!\[.*?\]\(.*?\)", "", content)
            # Collapse runs of blank lines into a single blank line
            content = re.sub(r"\n\s*\n", "\n\n", content)
            compact_content += content.strip() + "\n\n"
    return compact_content

def main():
    help_center_data = scrape_help_center()
    if not help_center_data:
        print("No collections found; nothing to save.")
        return
    # Save full JSON data
    with open("help_center_content.json", "w", encoding="utf-8") as f:
        json.dump(help_center_data, f, indent=2, ensure_ascii=False)
    print("Full data saved to help_center_content.json")
    # Generate and save Markdown-only content
    markdown_content = generate_markdown_only(help_center_data)
    with open("help_center_content_md_only.md", "w", encoding="utf-8") as f:
        f.write(markdown_content)
    print("Markdown-only content saved to help_center_content_md_only.md")
    # Generate and save ultra-compact content
    ultra_compact_content = generate_ultra_compact(help_center_data)
    with open("help_center_content_ultra_compact.md", "w", encoding="utf-8") as f:
        f.write(ultra_compact_content)
    print("Ultra-compact content saved to help_center_content_ultra_compact.md")


if __name__ == "__main__":
    main()
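
After a run, a quick way to sanity-check the results is to load the JSON the script writes and count what was captured. The snippet below is a minimal sketch of such a check (a hypothetical companion script, not part of the gist); it only assumes the output schema produced by scrape_help_center() above.

# inspect_output.py: hypothetical helper, not part of the original gist.
# Reads help_center_content.json (written by main() above) and prints
# a per-collection article count plus a total.
import json

with open("help_center_content.json", encoding="utf-8") as f:
    data = json.load(f)

for collection in data:
    print(f"{collection['name']}: {len(collection['articles'])} articles")
print(f"Total articles: {sum(len(c['articles']) for c in data)}")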