maxko87 · August 28, 2024 02:43
diff --git a/intercom_help_export.py b/intercom_help_export.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

 import json
 import re
 import time

 import requests
 from bs4 import BeautifulSoup
 from markdownify import markdownify as md

 # Landing page of the help center; scrape_help_center() starts its crawl here.
 main_url = "https://help.dover.com/en/"


 def fetch_url(url, timeout=30):
    """Download *url* with an HTTP GET and return the body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the whole export.

    Returns:
        The decoded response body, or ``None`` when the request fails or the
        server answers with an error status (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best-effort scraping: report the failure and let the caller skip it.
        print(f"Error fetching {url}: {e}")
        return None
    return response.text


 def parse_collection(collection_url):
    """Fetch a collection page and scrape every article it links to.

    Args:
        collection_url: URL of the collection page.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``content_html`` and
        ``content_md``, one per article that could be fetched and that has
        both an ``<h1>`` and an ``<article>`` element. The list is empty when
        the collection page itself cannot be fetched.
    """
    # Imported locally so the function does not depend on edits elsewhere.
    from urllib.parse import urljoin

    html = fetch_url(collection_url)
    if not html:
        return []

    soup = BeautifulSoup(html, "html.parser")
    articles = []
    seen_urls = set()

    for article_link in soup.find_all("a", href=lambda x: x and "/articles/" in x):
        # Relative hrefs cannot be requested directly; resolve them against
        # the collection page. Absolute URLs pass through unchanged.
        article_url = urljoin(collection_url, article_link["href"])
        if article_url in seen_urls:
            # The same article may be linked more than once on a page.
            continue
        seen_urls.add(article_url)

        article_html = fetch_url(article_url)
        if not article_html:
            continue

        article_soup = BeautifulSoup(article_html, "html.parser")
        title = article_soup.find("h1")
        content = article_soup.find("article")
        if not (title and content):
            # Not a regular article page; nothing to export.
            continue

        articles.append(
            {
                "title": title.get_text(strip=True),
                "url": article_url,
                "content_html": content.prettify(),
                "content_md": md(str(content), heading_style="ATX"),
            }
        )

    return articles


 def scrape_help_center():
    """Scrape every collection linked from the help center landing page.

    Fetches ``main_url``, follows each ``a.collection-link`` anchor and
    gathers that collection's articles with ``parse_collection``, pausing one
    second between collections.

    Returns:
        A list of ``{"name", "url", "articles"}`` dicts, one per collection.
        The list is empty when the landing page cannot be fetched, so the
        result can always be iterated.
    """
    # Imported locally so the function does not depend on edits elsewhere.
    from urllib.parse import urljoin

    html = fetch_url(main_url)
    if not html:
        return []

    soup = BeautifulSoup(html, "html.parser")
    collections = []

    for card in soup.find_all("a", class_="collection-link"):
        href = card.get("href")
        if not href:
            # An anchor without a target cannot be followed.
            continue
        # Resolve relative links against the landing page; absolute URLs are
        # returned unchanged.
        collection_url = urljoin(main_url, href)

        name_tag = card.find("div", class_=lambda x: x and "font-semibold" in x)
        collection_name = (
            name_tag.get_text(strip=True) if name_tag else "Unnamed Collection"
        )

        print(f"Processing Collection: {collection_name}")
        articles = parse_collection(collection_url)
        collections.append(
            {"name": collection_name, "url": collection_url, "articles": articles}
        )

        time.sleep(1)  # Be nice to the server

    return collections


 def generate_markdown_only(help_center_data):
    """Render the scraped data as a single Markdown document.

    Each collection becomes a level-1 heading and each of its articles a
    level-2 heading followed by the article's ``content_md``; every piece is
    terminated by a blank line.

    Args:
        help_center_data: Iterable of collection dicts with ``name`` and
            ``articles`` keys; each article has ``title`` and ``content_md``.

    Returns:
        The assembled Markdown text (empty string for no collections).
    """
    pieces = []
    for section in help_center_data:
        pieces.append(f"# {section['name']}\n\n")
        for entry in section["articles"]:
            pieces.append(f"## {entry['title']}\n\n")
            pieces.append(entry["content_md"] + "\n\n")
    return "".join(pieces)


 def generate_ultra_compact(help_center_data):
    """Render the scraped data as Markdown with images and URLs stripped.

    The layout matches ``# collection`` / ``## article`` headings followed by
    the article text. Within each article, image references are dropped,
    Markdown links are reduced to their visible text, bare URLs are removed
    and runs of blank lines are collapsed to one.

    Args:
        help_center_data: Iterable of collection dicts with ``name`` and
            ``articles`` keys; each article has ``title`` and ``content_md``.

    Returns:
        The compacted Markdown text (empty string for no collections).
    """
    # Compiled once, outside the loops.
    image_re = re.compile(r"!\[.*?\]\(.*?\)")
    link_re = re.compile(r"\[([^\]]*)\]\([^)]*\)")
    url_re = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    blank_run_re = re.compile(r"\n\s*\n")

    parts = []
    for collection in help_center_data:
        parts.append(f"# {collection['name']}\n\n")
        for article in collection["articles"]:
            parts.append(f"## {article['title']}\n\n")
            content = article["content_md"]
            # Images and links must go before bare URLs: the URL pattern also
            # consumes the closing ")" and would leave "![alt](" / "[text]("
            # fragments that the image pattern could later extend over
            # unrelated text.
            content = image_re.sub("", content)
            # Keep the link text, drop the target.
            content = link_re.sub(r"\1", content)
            # Remove any remaining bare URLs.
            content = url_re.sub("", content)
            # Collapse runs of blank lines into a single blank line.
            content = blank_run_re.sub("\n\n", content)
            parts.append(content.strip() + "\n\n")
    return "".join(parts)


 def main():
    """Scrape the help center and write the three export files.

    Writes ``help_center_content.json`` (full data),
    ``help_center_content_md_only.md`` (Markdown) and
    ``help_center_content_ultra_compact.md`` (Markdown without URLs and
    images) to the current working directory. Nothing is written when the
    scrape produced no data.
    """
    help_center_data = scrape_help_center()
    if not help_center_data:
        # The scrape yields nothing usable (None or empty) when the landing
        # page cannot be fetched; the generators below cannot iterate None,
        # and writing would only clobber earlier exports with empty files.
        print("No help center data scraped; nothing written.")
        return

    # Save full JSON data
    with open("help_center_content.json", "w", encoding="utf-8") as f:
        json.dump(help_center_data, f, indent=2, ensure_ascii=False)
    print("Full data saved to help_center_content.json")

    # Generate and save Markdown-only content
    markdown_content = generate_markdown_only(help_center_data)
    with open("help_center_content_md_only.md", "w", encoding="utf-8") as f:
        f.write(markdown_content)
    print("Markdown-only content saved to help_center_content_md_only.md")

    # Generate and save ultra-compact content
    ultra_compact_content = generate_ultra_compact(help_center_data)
    with open("help_center_content_ultra_compact.md", "w", encoding="utf-8") as f:
        f.write(ultra_compact_content)
    print("Ultra-compact content saved to help_center_content_ultra_compact.md")


 # Run the export only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	import json
	import re
	import time

	import requests
	from bs4 import BeautifulSoup
	from markdownify import markdownify as md

	# Landing page of the help center; scrape_help_center() starts its crawl here.
	main_url = "https://help.dover.com/en/"


	def fetch_url(url, timeout=30):
	    """Download *url* with an HTTP GET and return the body as text.

	    Args:
	        url: Address to request.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout ``requests`` can block indefinitely on a stalled
	            connection, which would hang the whole export.

	    Returns:
	        The decoded response body, or ``None`` when the request fails or the
	        server answers with an error status (the error is printed).
	    """
	    try:
	        response = requests.get(url, timeout=timeout)
	        response.raise_for_status()
	    except requests.RequestException as e:
	        # Best-effort scraping: report the failure and let the caller skip it.
	        print(f"Error fetching {url}: {e}")
	        return None
	    return response.text


	def parse_collection(collection_url):
	    """Fetch a collection page and scrape every article it links to.

	    Args:
	        collection_url: URL of the collection page.

	    Returns:
	        A list of dicts with the keys ``title``, ``url``, ``content_html`` and
	        ``content_md``, one per article that could be fetched and that has
	        both an ``<h1>`` and an ``<article>`` element. The list is empty when
	        the collection page itself cannot be fetched.
	    """
	    # Imported locally so the function does not depend on edits elsewhere.
	    from urllib.parse import urljoin

	    html = fetch_url(collection_url)
	    if not html:
	        return []

	    soup = BeautifulSoup(html, "html.parser")
	    articles = []
	    seen_urls = set()

	    for article_link in soup.find_all("a", href=lambda x: x and "/articles/" in x):
	        # Relative hrefs cannot be requested directly; resolve them against
	        # the collection page. Absolute URLs pass through unchanged.
	        article_url = urljoin(collection_url, article_link["href"])
	        if article_url in seen_urls:
	            # The same article may be linked more than once on a page.
	            continue
	        seen_urls.add(article_url)

	        article_html = fetch_url(article_url)
	        if not article_html:
	            continue

	        article_soup = BeautifulSoup(article_html, "html.parser")
	        title = article_soup.find("h1")
	        content = article_soup.find("article")
	        if not (title and content):
	            # Not a regular article page; nothing to export.
	            continue

	        articles.append(
	            {
	                "title": title.get_text(strip=True),
	                "url": article_url,
	                "content_html": content.prettify(),
	                "content_md": md(str(content), heading_style="ATX"),
	            }
	        )

	    return articles


	def scrape_help_center():
	    """Scrape every collection linked from the help center landing page.

	    Fetches ``main_url``, follows each ``a.collection-link`` anchor and
	    gathers that collection's articles with ``parse_collection``, pausing one
	    second between collections.

	    Returns:
	        A list of ``{"name", "url", "articles"}`` dicts, one per collection.
	        The list is empty when the landing page cannot be fetched, so the
	        result can always be iterated.
	    """
	    # Imported locally so the function does not depend on edits elsewhere.
	    from urllib.parse import urljoin

	    html = fetch_url(main_url)
	    if not html:
	        return []

	    soup = BeautifulSoup(html, "html.parser")
	    collections = []

	    for card in soup.find_all("a", class_="collection-link"):
	        href = card.get("href")
	        if not href:
	            # An anchor without a target cannot be followed.
	            continue
	        # Resolve relative links against the landing page; absolute URLs are
	        # returned unchanged.
	        collection_url = urljoin(main_url, href)

	        name_tag = card.find("div", class_=lambda x: x and "font-semibold" in x)
	        collection_name = (
	            name_tag.get_text(strip=True) if name_tag else "Unnamed Collection"
	        )

	        print(f"Processing Collection: {collection_name}")
	        articles = parse_collection(collection_url)
	        collections.append(
	            {"name": collection_name, "url": collection_url, "articles": articles}
	        )

	        time.sleep(1)  # Be nice to the server

	    return collections


	def generate_markdown_only(help_center_data):
	    """Render the scraped data as a single Markdown document.

	    Each collection becomes a level-1 heading and each of its articles a
	    level-2 heading followed by the article's ``content_md``; every piece is
	    terminated by a blank line.

	    Args:
	        help_center_data: Iterable of collection dicts with ``name`` and
	            ``articles`` keys; each article has ``title`` and ``content_md``.

	    Returns:
	        The assembled Markdown text (empty string for no collections).
	    """
	    pieces = []
	    for collection in help_center_data:
	        pieces.append(f"# {collection['name']}\n\n")
	        for article in collection["articles"]:
	            pieces.append(f"## {article['title']}\n\n")
	            pieces.append(article["content_md"] + "\n\n")
	    return "".join(pieces)


	def generate_ultra_compact(help_center_data):
	    """Render the scraped data as Markdown with images and URLs stripped.

	    The layout matches ``# collection`` / ``## article`` headings followed by
	    the article text. Within each article, image references are dropped,
	    Markdown links are reduced to their visible text, bare URLs are removed
	    and runs of blank lines are collapsed to one.

	    Args:
	        help_center_data: Iterable of collection dicts with ``name`` and
	            ``articles`` keys; each article has ``title`` and ``content_md``.

	    Returns:
	        The compacted Markdown text (empty string for no collections).
	    """
	    # Compiled once, outside the loops. The alternation bars and the ".*?"
	    # quantifiers are required: with escaped bars or a bare ".?" the patterns
	    # match almost nothing.
	    image_re = re.compile(r"!\[.*?\]\(.*?\)")
	    link_re = re.compile(r"\[([^\]]*)\]\([^)]*\)")
	    url_re = re.compile(
	        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
	    )
	    blank_run_re = re.compile(r"\n\s*\n")

	    parts = []
	    for collection in help_center_data:
	        parts.append(f"# {collection['name']}\n\n")
	        for article in collection["articles"]:
	            parts.append(f"## {article['title']}\n\n")
	            content = article["content_md"]
	            # Images and links must go before bare URLs: the URL pattern also
	            # consumes the closing ")" and would leave "![alt](" / "[text]("
	            # fragments behind.
	            content = image_re.sub("", content)
	            # Keep the link text, drop the target.
	            content = link_re.sub(r"\1", content)
	            # Remove any remaining bare URLs.
	            content = url_re.sub("", content)
	            # Collapse runs of blank lines into a single blank line.
	            content = blank_run_re.sub("\n\n", content)
	            parts.append(content.strip() + "\n\n")
	    return "".join(parts)


	def main():
	    """Scrape the help center and write the three export files.

	    Writes ``help_center_content.json`` (full data),
	    ``help_center_content_md_only.md`` (Markdown) and
	    ``help_center_content_ultra_compact.md`` (Markdown without URLs and
	    images) to the current working directory. Nothing is written when the
	    scrape produced no data.
	    """
	    help_center_data = scrape_help_center()
	    if not help_center_data:
	        # The scrape yields nothing usable (None or empty) when the landing
	        # page cannot be fetched; the generators below cannot iterate None,
	        # and writing would only clobber earlier exports with empty files.
	        print("No help center data scraped; nothing written.")
	        return

	    # Save full JSON data
	    with open("help_center_content.json", "w", encoding="utf-8") as f:
	        json.dump(help_center_data, f, indent=2, ensure_ascii=False)
	    print("Full data saved to help_center_content.json")

	    # Generate and save Markdown-only content
	    markdown_content = generate_markdown_only(help_center_data)
	    with open("help_center_content_md_only.md", "w", encoding="utf-8") as f:
	        f.write(markdown_content)
	    print("Markdown-only content saved to help_center_content_md_only.md")

	    # Generate and save ultra-compact content
	    ultra_compact_content = generate_ultra_compact(help_center_data)
	    with open("help_center_content_ultra_compact.md", "w", encoding="utf-8") as f:
	        f.write(ultra_compact_content)
	    print("Ultra-compact content saved to help_center_content_ultra_compact.md")


	# Run the export only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()