Skip to content

Instantly share code, notes, and snippets.

@DFilyushin
Created September 16, 2025 12:30
Show Gist options
  • Select an option

  • Save DFilyushin/f8c69f8840751f6da899a66c436674aa to your computer and use it in GitHub Desktop.

Select an option

Save DFilyushin/f8c69f8840751f6da899a66c436674aa to your computer and use it in GitHub Desktop.
Сгружаем статьи с habr.ru
import asyncio
from pathlib import Path
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from html_to_markdown import convert_to_markdown
from loguru import logger
# Characters (space, lowercase Latin and Cyrillic letters) that `save` keeps
# when turning an article title into a file name; each title character is
# lowercased before the membership check, so capital letters survive too.
ALLOWED_ALPHABET = " qwertyuiopasdfghjklzxcvbnmйцукенгшщзфывапролджэячсмитьбюхъё"
# Base URL for the aiohttp session; article paths are requested relative to it.
BASE_URL = "http://habr.com/"
# Reference article number: `fetch` logs each article's distance from this value.
START = 946696
def _text_or_none(soup: BeautifulSoup, name: str, css_class: str) -> str | None:
    """Return the text of the first ``name`` tag with ``css_class``, or None if absent."""
    node = soup.find(name, attrs={"class": css_class})
    return node.text if node is not None else None


async def get_article(num: int) -> dict | None:
    """Download one article page and extract its content and metadata.

    Args:
        num: Article number; the page is requested as ``ru/articles/{num}``
            relative to ``BASE_URL``.

    Returns:
        A dict with the keys ``title``, ``text``, ``difficulty``, ``views``,
        ``time``, ``data`` and ``author``, or ``None`` when the page contains
        no article title heading. ``difficulty``, ``views``, ``time``,
        ``data`` and ``author`` are ``None`` when the corresponding element
        is missing from the page.
    """
    async with (
        ClientSession(base_url=BASE_URL) as session,
        session.get(f"ru/articles/{num}") as r,
    ):
        soup = BeautifulSoup(await r.text(), "html.parser")

    title = soup.find("h1", attrs={"class": "tm-title tm-title_h1"})
    if title is None:
        # No title heading: the caller treats this page as "no article".
        return None

    text = convert_to_markdown(soup.find("div", attrs={"class": "tm-article-body"}))

    # Metadata elements are optional: a missing one yields None instead of an
    # AttributeError that would discard an otherwise parsable article.
    author = _text_or_none(soup, "a", "tm-user-info__username")
    if author is not None:
        author = author.replace('"', "").strip()

    return {
        "title": title.text,
        "text": text,
        "difficulty": _text_or_none(soup, "span", "tm-article-complexity__label"),
        "views": _text_or_none(soup, "span", "tm-icon-counter__value"),
        "time": _text_or_none(soup, "span", "tm-article-reading-time__label"),
        "data": _text_or_none(soup, "span", "tm-article-datetime-published"),
        "author": author,
    }
def save(a: dict) -> None:
    """Write one article to ``articles/<sanitized title>.md`` as Markdown.

    The file starts with a ``---`` delimited header holding the optional
    ``tags`` (difficulty), ``views``, ``time`` and ``data`` values, followed
    by an optional author link and the article text.

    Args:
        a: Article dict as returned by ``get_article``; the keys ``title``,
            ``text``, ``difficulty``, ``views``, ``time``, ``data`` and
            ``author`` are read.
    """
    # Keep only title characters whose lowercase form is in ALLOWED_ALPHABET.
    filename = "".join(ch for ch in a["title"] if ch.lower() in ALLOWED_ALPHABET)

    # Built with plain concatenation so the module does not depend on the
    # nested-quote f-string syntax that only Python 3.12+ accepts.
    tags_line = "tags: " + a["difficulty"] if a["difficulty"] else ""
    author_block = "автор: [[users/" + a["author"] + "]]\n\n" if a["author"] else ""
    # NOTE(review): pages are fetched from habr.com, so this habr.ru prefix
    # may never match the links in the text — confirm the intended prefix.
    body = a["text"].replace("https://habr.ru/users/", "users/")

    target = Path(f"articles/{filename}.md")
    # The output directory may not exist yet on a first run.
    target.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the content is Cyrillic and must not depend on the
    # platform's default locale encoding.
    with target.open("w", encoding="utf-8") as f:
        f.write(
            "---\n"
            f"{tags_line}\n"
            f"views: {a['views']}\n"
            f"time: {a['time']}\n"
            f"data: {a['data']}\n"
            "---\n\n"
            f"{author_block}"
            f"{body}",
        )
async def fetch(i: int) -> None:
    """Fetch article ``i``, save it, and log the outcome.

    Each log line carries the article number and its distance from ``START``.
    Any exception raised while downloading, parsing or saving is logged and
    followed by a 15-second pause; it is never propagated to the caller.

    Args:
        i: Article number to fetch.
    """
    # Distance from the reference article, computed once from the shared
    # constant instead of repeating its literal value in every log call.
    offset = START - i
    try:
        a = await get_article(i)
        if not a:
            logger.debug("[{}:{}] 404", i, offset)
            return
        logger.success("[{}:{}] ok", i, offset)
        save(a)
    except Exception as e:  # noqa: BLE001
        logger.error("[{}:{}] {}", i, offset, e)
        # NOTE(review): the pause is treated as a back-off after a failure;
        # confirm it was not meant to run after every request.
        await asyncio.sleep(15)
async def main(first: int = 864008, step: int = 2) -> None:
    """Crawl article numbers downwards, one request at a time.

    Adds ``obsa-habr.log`` as a log sink, then awaits ``fetch`` for
    ``first``, ``first - step``, ... while the number stays above zero.

    Args:
        first: Article number to begin with. The default is lower than
            ``START`` (946696) — presumably a resume point; adjust as needed.
        step: Positive distance between consecutive article numbers.

    Raises:
        ValueError: If ``step`` is zero (raised by ``range``).
    """
    logger.add("obsa-habr.log")
    # 946696 - start
    for i in range(first, 0, -step):
        await fetch(i)
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment