Last active
November 23, 2024 12:17
-
-
Save EvilFreelancer/bef91f8c03452cee5799640580297d77 to your computer and use it in GitHub Desktop.
Сохранение HTML страниц из интернета в Markdown через Firefox и Selenium
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import os.path | |
import urllib.parse | |
import zipfile | |
import datetime | |
from selenium import webdriver | |
from selenium.webdriver.firefox.service import Service as FirefoxService | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
import requests | |
import click | |
from bs4 import BeautifulSoup | |
from markdownify import ATX, MarkdownConverter | |
from readability import Document | |
import slugify | |
class ImageBlockConverter(MarkdownConverter):
    """Markdown converter that appends a newline after every image.

    Behaves like ``markdownify.MarkdownConverter`` except that the Markdown
    produced for each ``<img>`` element is followed by ``'\\n'``.
    """

    def convert_img(self, el, text, *args, **kwargs):
        """Convert an ``<img>`` element and append a trailing newline.

        Any arguments after ``text`` are forwarded unchanged to the parent
        implementation rather than being named here.
        NOTE(review): markdownify appears to have replaced the positional
        ``convert_as_inline`` argument with a keyword ``parent_tags`` argument
        in newer releases; forwarding ``*args``/``**kwargs`` works with either
        calling convention -- confirm against the installed version.
        """
        return super().convert_img(el, text, *args, **kwargs) + '\n'
def md(soup, **options):
    """Render a parsed BeautifulSoup tree as Markdown.

    All keyword ``options`` are handed unchanged to ``ImageBlockConverter``;
    the converted string returned by its ``convert_soup`` is the result.
    """
    converter = ImageBlockConverter(**options)
    return converter.convert_soup(soup)
def _localize_images(doc_dom, base_url, target_dir):
    """Download each ``<img>`` into *target_dir* and point its ``src`` at the local copy.

    Images that have no ``src``, no file extension in their URL path, or that
    cannot be downloaded are reported on stderr and left untouched.
    """
    for i, img in enumerate(doc_dom.find_all('img')):
        src = img.get('src')
        if not src:
            click.echo(f'img tag without src, skipping: {str(img)}', err=True)
            continue

        img_url = urllib.parse.urlparse(src)
        _, extension = os.path.splitext(img_url.path)
        if not extension:
            click.echo(f'No file extension for img src, leaving as is: {img_url.geturl()}', err=True)
            continue

        # urljoin returns absolute URLs unchanged and resolves relative and
        # protocol-relative ("//host/path") references against the base.
        download_url = urllib.parse.urljoin(base_url, src)
        try:
            img_resp = requests.get(download_url, timeout=30)
            img_resp.raise_for_status()
        except requests.RequestException as e:
            # One broken image should not abort saving the whole page.
            click.echo(f'Failed to download image, leaving as is: {download_url} ({e})', err=True)
            continue

        saved_path = f'image_{i}{extension}'
        with open(os.path.join(target_dir, saved_path), 'wb') as f:
            f.write(img_resp.content)
        img['src'] = saved_path


def _absolutize_links(doc_dom, base_url):
    """Rewrite host-less ``<a href>`` values as absolute URLs based on *base_url*."""
    for a in doc_dom.find_all('a'):
        # Tag.get() looks at attributes; ``'href' in a`` would test the tag's
        # children instead and therefore flag every link as malformed.
        href = a.get('href')
        if href is None:
            click.echo(f'Malformed a tag, skipping: {str(a)}', err=True)
            continue
        if not urllib.parse.urlparse(href).hostname:
            a['href'] = urllib.parse.urljoin(base_url, href)


@click.command()
@click.argument('url')
@click.option('-z', '--zip', is_flag=True)
@click.option('-r', '--root', default=os.getcwd())
def save_page(url, zip, root):
    """Save a web page as Markdown with locally stored images.

    \f
    (Text after the form feed above is hidden from ``--help`` by click.)

    The page is opened through Firefox's ``about:reader`` view in a headless
    browser, reduced to its main content with readability, and written to
    ``<root>/<slugified title>/content.md`` next to the downloaded images.

    Args:
        url: Address of the page to save.
        zip: When true, also write ``<slugified title>.zip`` (in the current
            working directory) containing the saved files plus a ``meta.txt``
            with the final URL and a UTC timestamp. The name shadows the
            builtin but is kept because click derives it from ``--zip``.
        root: Directory under which the page directory is created.
    """
    # Configure a headless Firefox driver.
    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')
    driver_service = FirefoxService(executable_path='/usr/bin/geckodriver')

    with webdriver.Firefox(service=driver_service, options=options) as driver:
        # NOTE(review): ``url`` is inserted unencoded; a URL containing ``&``
        # may be truncated by the reader view -- confirm before quoting it,
        # since the wait below compares against this exact string.
        initial_url = f"about:reader?url={url}"
        driver.get(initial_url)

        # Wait until the URL has changed (redirect) and the body has text.
        try:
            WebDriverWait(driver, 5).until(
                lambda d: d.current_url != initial_url and len(d.find_element(By.TAG_NAME, 'body').text.strip()) > 0
            )
        except Exception as e:
            # Best effort: continue with whatever the browser has loaded.
            click.echo(f"Timeout or error occurred: {e}", err=True)

        response_text = driver.page_source
        final_url = driver.current_url

    # Relative references can only be resolved against an http(s) URL; if the
    # browser still reports something else (e.g. the about:reader wrapper),
    # fall back to the URL that was requested.
    if urllib.parse.urlparse(final_url).scheme in ('http', 'https'):
        base_url = final_url
    else:
        base_url = url

    doc = Document(response_text)
    # An empty slug would make the output directory equal to ``root`` itself.
    nice_name = slugify.slugify(doc.title()) or 'untitled'
    doc_dom = BeautifulSoup(doc.summary(), features='lxml')

    out_dir = os.path.join(root, nice_name)
    os.makedirs(out_dir, exist_ok=True)

    # Save images locally and make remaining relative links absolute so they
    # keep pointing at the original site.
    _localize_images(doc_dom, base_url, out_dir)
    _absolutize_links(doc_dom, base_url)

    # Convert the cleaned HTML to Markdown.
    markdown_content = md(
        doc_dom,
        strip=['style', 'script'],
        extensions=['fenced_code'],
        heading_style=ATX
    )

    # Write the Markdown file.
    with open(os.path.join(out_dir, 'content.md'), 'w', encoding='utf-8') as f:
        f.write(markdown_content)

    # Optionally bundle the page content and images into a ZIP archive.
    if zip:
        with zipfile.ZipFile(nice_name + '.zip', 'w') as zipf:
            zipf.writestr(
                os.path.join(nice_name, 'meta.txt'),
                '\n'.join([final_url, str(datetime.datetime.now(datetime.UTC))])
            )
            for p in os.listdir(out_dir):
                zipf.write(
                    os.path.join(out_dir, p),
                    arcname=os.path.join(nice_name, p)
                )
# Script entry point: invoke the click command only when this file is run
# directly; save_page() is called without arguments here.
if __name__ == '__main__':
    save_page()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment