Skip to content

Instantly share code, notes, and snippets.

@EvilFreelancer
Last active November 23, 2024 12:17
Show Gist options
  • Save EvilFreelancer/bef91f8c03452cee5799640580297d77 to your computer and use it in GitHub Desktop.
Save EvilFreelancer/bef91f8c03452cee5799640580297d77 to your computer and use it in GitHub Desktop.
Сохранение HTML страниц из интернета в Markdown через Firefox и Selenium
import os
import os.path
import urllib.parse
import zipfile
import datetime
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import click
from bs4 import BeautifulSoup
from markdownify import ATX, MarkdownConverter
from readability import Document
import slugify
class ImageBlockConverter(MarkdownConverter):
    """Custom MarkdownConverter that adds a newline after every image."""

    def convert_img(self, el, text, *args, **kwargs):
        """Convert an ``<img>`` element and append a newline to the result.

        Args:
            el: The image element being converted.
            text: The text already produced for the element's content.
            *args: Extra positional arguments supplied by the base
                converter (e.g. ``convert_as_inline``), forwarded untouched.
            **kwargs: Extra keyword arguments supplied by the base
                converter, forwarded untouched.

        Returns:
            The parent class's rendering of the image followed by a newline.
        """
        # NOTE(review): older markdownify releases pass the third argument
        # positionally as ``convert_as_inline``; newer ones are understood to
        # pass ``parent_tags=...`` by keyword. Forwarding everything keeps the
        # override working with either calling convention - confirm against
        # the installed markdownify version.
        return super().convert_img(el, text, *args, **kwargs) + '\n'
def md(soup, **options):
    """Render a parsed document tree as Markdown.

    Args:
        soup: The parsed tree to convert; it is passed to the converter's
            ``convert_soup`` method.
        **options: Keyword options forwarded to the ``ImageBlockConverter``
            constructor.

    Returns:
        Whatever ``ImageBlockConverter.convert_soup`` returns for ``soup``.
    """
    converter = ImageBlockConverter(**options)
    return converter.convert_soup(soup)
def _localize_images(doc_dom, base_url, target_dir, timeout=30):
    """Download each ``<img>`` of ``doc_dom`` into ``target_dir`` and relink it.

    Images that have no ``src``, no file extension in their URL path, or that
    cannot be downloaded are reported on stderr and left untouched.

    Args:
        doc_dom: Parsed document whose ``img`` tags are rewritten in place.
        base_url: URL used to resolve relative image sources.
        target_dir: Existing directory that receives the ``image_<i><ext>`` files.
        timeout: Seconds to wait for each image request.
    """
    for i, img in enumerate(doc_dom.find_all('img')):
        src = img.get('src')
        if not src:
            click.echo(f'img tag without src, skipping: {str(img)}', err=True)
            continue

        img_url = urllib.parse.urlparse(src)
        _, extension = os.path.splitext(img_url.path)
        if not extension:
            click.echo(f'No file extension for img src, leaving as is: {img_url.geturl()}', err=True)
            continue

        if img_url.scheme and img_url.hostname:
            download_url = img_url.geturl()
        else:
            # Covers relative paths and protocol-relative ("//host/x.png") sources.
            download_url = urllib.parse.urljoin(base_url, img_url.geturl())

        try:
            img_resp = requests.get(download_url, timeout=timeout)
            img_resp.raise_for_status()
        except requests.RequestException as e:
            # One broken image should not abort saving the whole page.
            click.echo(f'Failed to download image, leaving as is: {download_url} ({e})', err=True)
            continue

        saved_path = f'image_{i}{extension}'
        with open(os.path.join(target_dir, saved_path), 'wb') as f:
            f.write(img_resp.content)
        img['src'] = saved_path


def _absolutize_links(doc_dom, base_url):
    """Rewrite host-less ``<a href>`` values in ``doc_dom`` against ``base_url``.

    Anchors without an ``href`` attribute are reported on stderr and skipped.
    """
    for a in doc_dom.find_all('a'):
        # Look the attribute up explicitly: ``'href' in a`` would test the
        # tag's child nodes rather than its attributes.
        href = a.get('href')
        if href is None:
            click.echo(f'Malformed a tag, skipping: {str(a)}', err=True)
            continue
        if not urllib.parse.urlparse(href).hostname:
            a['href'] = urllib.parse.urljoin(base_url, href)


def _write_archive(archive_path, folder_name, source_dir, final_url):
    """Zip ``source_dir`` under ``folder_name`` together with a ``meta.txt``.

    ``meta.txt`` holds ``final_url`` and the current UTC timestamp, one per line.
    """
    with zipfile.ZipFile(archive_path, 'w') as zipf:
        zipf.writestr(
            os.path.join(folder_name, 'meta.txt'),
            # timezone.utc is what datetime.UTC aliases, without needing 3.11+.
            '\n'.join([final_url, str(datetime.datetime.now(datetime.timezone.utc))])
        )
        for p in os.listdir(source_dir):
            zipf.write(
                os.path.join(source_dir, p),
                arcname=os.path.join(folder_name, p)
            )


@click.command()
@click.argument('url')
@click.option('-z', '--zip', is_flag=True)
@click.option('-r', '--root', default=os.getcwd())
def save_page(url, zip, root):
    """Save the page at URL as Markdown with locally stored images.

    The page is opened in headless Firefox reader view, reduced to its main
    content, and written to content.md inside a directory under ROOT named
    after the slugified page title. With --zip, that directory plus a
    meta.txt file is also packed into an archive in the current directory.
    """
    # Configure the Firefox driver.
    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')
    driver_service = FirefoxService(executable_path='/usr/bin/geckodriver')

    with webdriver.Firefox(service=driver_service, options=options) as driver:
        initial_url = f"about:reader?url={url}"
        driver.get(initial_url)

        # Wait until the URL has changed (redirect) and the body has text.
        try:
            WebDriverWait(driver, 5).until(
                lambda d: d.current_url != initial_url
                and len(d.find_element(By.TAG_NAME, 'body').text.strip()) > 0
            )
        except Exception as e:
            # Best effort: carry on with whatever the browser has rendered.
            click.echo(f"Timeout or error occurred: {e}", err=True)

        response_text = driver.page_source
        final_url = driver.current_url
    # The browser is no longer needed once the source and URL are captured.

    doc = Document(response_text)
    # Fall back to a fixed name so an empty slug cannot collapse the output
    # directory onto ``root`` itself.
    nice_name = slugify.slugify(doc.title()) or 'untitled'
    doc_dom = BeautifulSoup(doc.summary(), features='lxml')

    tempdir = os.path.join(root, nice_name)
    os.makedirs(tempdir, exist_ok=True)

    # Save the images next to the Markdown file.
    _localize_images(doc_dom, final_url, tempdir)

    # Point relative links back at the original site.
    _absolutize_links(doc_dom, final_url)

    # Convert the HTML to Markdown.
    markdown_content = md(
        doc_dom,
        strip=['style', 'script'],
        extensions=['fenced_code'],
        heading_style=ATX
    )

    # Write the Markdown file.
    with open(os.path.join(tempdir, 'content.md'), 'w', encoding='utf-8') as f:
        f.write(markdown_content)

    # Create a ZIP archive with the page content and images.
    if zip:
        _write_archive(nice_name + '.zip', nice_name, tempdir, final_url)
# Run the click command only when this file is executed as a script.
if __name__ == '__main__':
    save_page()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment