Download and convert a Wikipedia page to Markdown with LaTeX math expressions.
#!/usr/bin/env python3
# coding: utf-8
#
# Requirements:
#     pip3 install bs4 markdownify requests
# Usage:
#     dl-wiki.py https://en.wikipedia.org/wiki/Formal_derivative
import re
import sys
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup, Comment
from markdownify import markdownify

def get_wiki_content(parsed_url):
    # Fetch the rendered page HTML via the MediaWiki action=parse API.
    # variant=zh-hans requests the simplified-Chinese variant on zh wikis.
    url = (
        "https://{lang}.wikipedia.org/w/api.php"
        "?action=parse&format=json&prop=displaytitle|text"
        "&variant=zh-hans&page={escaped_title}"
    ).format(**parsed_url)
    # requests automatically picks up proxy settings from environment
    # variables (http_proxy, https_proxy, all_proxy)
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise on HTTP error status
    return response.json()
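
# Note: download_wiki_page() below relies on this response shape
# (a subset of the MediaWiki action=parse reply):
#   {"parse": {"title": "<display title>", "text": {"*": "<rendered HTML>"}}}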

def parse_url(url: str):
    g = re.search(r'https://(.*?)\.wikipedia\.org/wiki/(.*)', url)
    if g:
        lang = g.group(1)
        escaped_title = g.group(2)
        title = unquote(escaped_title)
    else:
        raise ValueError("invalid wikipedia url: " + url)
    return {
        "lang": lang,
        "title": title,
        "escaped_title": escaped_title,
    }
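
# Illustrative example (not executed):
#   parse_url("https://en.wikipedia.org/wiki/Formal_derivative")
#   -> {"lang": "en",
#       "title": "Formal_derivative",
#       "escaped_title": "Formal_derivative"}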

def convert_math_to_latex(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove HTML comments
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    # Remove spans hidden with display: none
    hidden_spans = soup.find_all('span', style=lambda value: value and 'display: none' in value)
    for span in hidden_spans:
        span.decompose()

    # Remove <style> tags
    for style_tag in soup.find_all('style'):
        style_tag.decompose()

    # Remove reference superscripts such as:
    # <sup class="reference" id="cite_ref-1">
    for ref in soup.find_all('sup', class_='reference'):
        ref.decompose()

    # Remove navbox divs
    for navbox in soup.find_all('div', class_='navbox'):
        navbox.decompose()
    for navbox in soup.find_all('div', class_='navbox-styles'):
        navbox.decompose()

    # Remove "[edit]" section links
    for edit_link in soup.find_all('span', class_='mw-editsection'):
        edit_link.decompose()

    # Wrap inline texhtml spans in backticks
    for span in soup.find_all('span', class_='texhtml'):
        text = span.get_text().strip()
        span.replace_with(f'`{text}`')

    # Unwrap <i> tags, keeping their text
    for i_tag in soup.find_all('i'):
        i_tag.replace_with(i_tag.get_text().strip())

    # Replace internal links with [[text]] wiki-link format.
    # Internal links start with /wiki/; links to non-existent
    # pages look like /w/index.php?...
    for link in soup.find_all('a'):
        href = link.get('href', '')
        if href.startswith('/wiki/') or href.startswith('/w/index.php'):
            link.replace_with(f'[[{link.get_text()}]]')

    # Replace math <img> tags with the LaTeX expression from their
    # alt text, surrounded by $
    for img in soup.find_all('img'):
        alttext = img.get('alt', '')
        latex = re.search(r'\{\\displaystyle (.*)\}', alttext)
        if latex:
            text = latex.group(1).strip()
            img.replace_with(f'${text}$')

    # After the conversion, unwrap the math wrapper spans:
    # <span class="mwe-math-element"> $R$ </span>
    for span in soup.find_all('span', class_='mwe-math-element'):
        span.replace_with(span.get_text().strip())

    # <span class="serif"> `L` </span>
    for span in soup.find_all('span', class_='serif'):
        span.replace_with(span.get_text().strip())

    # Remove stray <link> tags such as:
    # <link href="mw-data:TemplateStyles:r58896141" rel="mw-deduplicated-inline-style"/>
    for link_tag in soup.find_all('link'):
        link_tag.decompose()

    # Re-parse the prettified HTML so the replacements above are normalized
    soup = BeautifulSoup(soup.prettify(), 'html.parser')

    # Custom formatter: drop the newlines prettify() inserts around text nodes
    def formatter(text, **kwargs):
        if text:
            return text.strip('\n')
        return text

    return soup.prettify(formatter=formatter)
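
# Illustrative input/output for the math conversion above: Wikipedia serves
# formulas as <img> tags whose alt text carries the TeX source, e.g.
#   <img alt="{\displaystyle f(x)=x^{2}}" ...>
# which the img loop rewrites to the inline LaTeX string $f(x)=x^{2}$.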

def html_to_markdown2(html_content):
    markdown = markdownify(
        html_content,
        heading_style="ATX",
        bullets="-",
        strip=['script', 'style'],
        code_language="python",
        escape_underscores=False,
    )
    # Collapse runs of blank lines into a single blank line
    markdown = re.sub(r'\n\s*\n', '\n\n', markdown, flags=re.UNICODE)
    # Join soft-wrapped lines: a lone newline becomes a space
    markdown = re.sub(r'(?<!\n)\n(?!\n)', ' ', markdown, flags=re.UNICODE)
    # Squeeze repeated spaces
    markdown = re.sub(r' +', ' ', markdown)
    return markdown
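
# Example of the newline handling above (assumed input):
#   "a\nb\n\nc" -> "a b\n\nc"
# lone newlines become spaces; blank-line runs collapse to one blank line.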

def download_wiki_page(url: str):
    unescaped_url = unquote(url)
    parsed = parse_url(url)

    j = get_wiki_content(parsed)
    display_title = j["parse"]["title"]
    html_text = j["parse"]["text"]["*"]

    output_fn = parsed["title"]

    html_latex_text = convert_math_to_latex(html_text)
    md_text = html_to_markdown2(html_latex_text)

    # Write the source URL, a top-level heading, then the converted body
    with open(output_fn + '.md', 'w', encoding='utf-8') as f:
        f.write(unescaped_url)
        f.write("\n\n")
        f.write("# {}".format(display_title))
        f.write("\n")
        f.write(md_text)

    return output_fn

if __name__ == "__main__":
    escaped_url = sys.argv[1]
    output_fn = download_wiki_page(escaped_url)
    print(output_fn + '.md')
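
# Example session (output filename depends on the page title):
#   $ python3 dl-wiki.py https://en.wikipedia.org/wiki/Formal_derivative
#   Formal_derivative.md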