Created
August 28, 2024 02:43
-
-
Save maxko87/36742a49b997d228c5eb1b128a069418 to your computer and use it in GitHub Desktop.
Scrape intercom help center
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
# Landing page of the help center; scraping starts here to discover the
# collection links.
main_url = "https://help.dover.com/en/"
def fetch_url(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address to request with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        The response text, or ``None`` when the request fails (connection
        error, timeout, or an HTTP error status). The failure is printed
        rather than raised so callers can skip the page and continue.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return response.text
def parse_collection(collection_url):
    """Scrape every article linked from a collection page.

    Args:
        collection_url: URL of the collection page to read.

    Returns:
        A list with one dict per article that has both an ``<h1>`` and an
        ``<article>`` element. Each dict holds ``title``, ``url``,
        ``content_html`` and ``content_md``. The list is empty when the
        collection page cannot be fetched; articles that fail to download
        are skipped.
    """
    html = fetch_url(collection_url)
    if not html:
        return []

    soup = BeautifulSoup(html, "html.parser")
    articles = []
    seen_urls = set()
    for article_link in soup.find_all("a", href=lambda x: x and "/articles/" in x):
        # hrefs may be relative to the collection page; resolve them so the
        # request does not fail on a URL without scheme/host.
        article_url = urljoin(collection_url, article_link["href"])
        # The same article can be linked more than once on a page; fetch
        # and record it only the first time.
        if article_url in seen_urls:
            continue
        seen_urls.add(article_url)

        article_html = fetch_url(article_url)
        if not article_html:
            continue

        article_soup = BeautifulSoup(article_html, "html.parser")
        title = article_soup.find("h1")
        content = article_soup.find("article")
        if title and content:
            articles.append(
                {
                    "title": title.get_text(strip=True),
                    "url": article_url,
                    "content_html": content.prettify(),
                    "content_md": md(str(content), heading_style="ATX"),
                }
            )
    return articles
def scrape_help_center():
    """Scrape all collections, and their articles, from the help center.

    Starts at ``main_url``, follows every ``<a class="collection-link">``
    and delegates each collection page to ``parse_collection``.

    Returns:
        A list of dicts with keys ``name``, ``url`` and ``articles``. The
        list is empty when the landing page cannot be fetched, so callers
        can always iterate the result.
    """
    html = fetch_url(main_url)
    if not html:
        # Return an empty list (not None) to match parse_collection and to
        # keep downstream iteration from raising TypeError.
        return []

    soup = BeautifulSoup(html, "html.parser")
    collections = []
    for card in soup.find_all("a", class_="collection-link"):
        href = card.get("href")
        if not href:
            # An anchor without a target cannot be followed.
            continue
        # Resolve relative links against the landing page.
        collection_url = urljoin(main_url, href)

        name_tag = card.find("div", class_=lambda x: x and "font-semibold" in x)
        if name_tag:
            collection_name = name_tag.get_text(strip=True)
        else:
            collection_name = "Unnamed Collection"

        print(f"Processing Collection: {collection_name}")
        articles = parse_collection(collection_url)
        collections.append(
            {"name": collection_name, "url": collection_url, "articles": articles}
        )
        time.sleep(1)  # Be nice to the server
    return collections
def generate_markdown_only(help_center_data):
    """Render the scraped data as one Markdown document.

    Args:
        help_center_data: Iterable of collection dicts, each with a
            ``name`` and a list of ``articles``; every article provides
            ``title`` and ``content_md``.

    Returns:
        A string with a level-1 heading per collection and a level-2
        heading plus Markdown body per article, each followed by a blank
        line. Empty input yields an empty string.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = []
    for collection in help_center_data:
        parts.append(f"# {collection['name']}\n\n")
        for article in collection["articles"]:
            parts.append(f"## {article['title']}\n\n")
            parts.append(article["content_md"] + "\n\n")
    return "".join(parts)
# Markdown image reference: ![alt](target).
_IMAGE_RE = re.compile(r"!\[.*?\]\(.*?\)")
# Inline link to an absolute http(s) URL: [text](https://...). The target may
# contain one level of balanced parentheses; it never spans lines.
_HTTP_LINK_RE = re.compile(
    r"\[([^\]]*)\]\(\s*https?://(?:[^()\n]|\([^()\n]*\))*\)"
)
# Bare or <autolinked> http(s) URL; stops at whitespace and bracket characters
# so surrounding Markdown punctuation is left intact.
_BARE_URL_RE = re.compile(r"<?https?://[^\s<>()\[\]]+>?")
# Two newlines separated only by whitespace, i.e. a run of blank lines.
_BLANK_RUN_RE = re.compile(r"\n\s*\n")


def generate_ultra_compact(help_center_data):
    """Render the scraped data as Markdown with images and URLs stripped.

    Args:
        help_center_data: Iterable of collection dicts, each with a
            ``name`` and a list of ``articles``; every article provides
            ``title`` and ``content_md``.

    Returns:
        A string with a level-1 heading per collection and a level-2
        heading per article. In each article body, image references are
        removed, links to http(s) URLs are reduced to their link text, bare
        http(s) URLs are removed, and runs of blank lines are collapsed to
        a single blank line. Empty input yields an empty string.
    """
    parts = []
    for collection in help_center_data:
        parts.append(f"# {collection['name']}\n\n")
        for article in collection["articles"]:
            parts.append(f"## {article['title']}\n\n")
            content = article["content_md"]
            # Images go first: once their URL is stripped the ![..](..)
            # syntax is broken and can no longer be recognised.
            content = _IMAGE_RE.sub("", content)
            # Keep the readable text of http(s) links, drop the target.
            content = _HTTP_LINK_RE.sub(r"\1", content)
            # Remove any remaining bare URLs.
            content = _BARE_URL_RE.sub("", content)
            # Collapse runs of blank lines into one blank line.
            content = _BLANK_RUN_RE.sub("\n\n", content)
            parts.append(content.strip() + "\n\n")
    return "".join(parts)
def main():
    """Scrape the help center and write the three output files.

    Writes ``help_center_content.json`` (full data),
    ``help_center_content_md_only.md`` (Markdown only) and
    ``help_center_content_ultra_compact.md`` (Markdown without images and
    URLs) to the current directory. When nothing could be scraped, no file
    is written, so earlier output is not overwritten with empty content.
    """
    help_center_data = scrape_help_center()
    if not help_center_data:
        # Covers both a failed landing-page fetch and a page that has no
        # collections; continuing would crash or write empty files.
        print("No help center content was scraped; no files were written.")
        return

    # Save full JSON data
    with open("help_center_content.json", "w", encoding="utf-8") as f:
        json.dump(help_center_data, f, indent=2, ensure_ascii=False)
    print("Full data saved to help_center_content.json")

    # Generate and save Markdown-only content
    markdown_content = generate_markdown_only(help_center_data)
    with open("help_center_content_md_only.md", "w", encoding="utf-8") as f:
        f.write(markdown_content)
    print("Markdown-only content saved to help_center_content_md_only.md")

    # Generate and save ultra-compact content
    ultra_compact_content = generate_ultra_compact(help_center_data)
    with open("help_center_content_ultra_compact.md", "w", encoding="utf-8") as f:
        f.write(ultra_compact_content)
    print("Ultra-compact content saved to help_center_content_ultra_compact.md")
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment