Created
September 16, 2025 12:30
-
-
Save DFilyushin/f8c69f8840751f6da899a66c436674aa to your computer and use it in GitHub Desktop.
Сгружаем статьи с habr.ru
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import asyncio | |
| from pathlib import Path | |
| from aiohttp import ClientSession | |
| from bs4 import BeautifulSoup | |
| from html_to_markdown import convert_to_markdown | |
| from loguru import logger | |
# Characters allowed in a saved article's file name: a space, the lowercase
# Latin letters and the lowercase Cyrillic letters.  Title characters are
# lower-cased only for the membership test, so upper-case letters are kept.
ALLOWED_ALPHABET = " qwertyuiopasdfghjklzxcvbnmйцукенгшщзфывапролджэячсмитьбюхъё"

# Root URL that article paths ("ru/articles/<id>") are resolved against.
# HTTPS is used so requests are not sent in clear text and do not need an
# extra http -> https redirect.
BASE_URL = "https://habr.com/"

# Reference article id; progress log lines report the offset from this id.
START = 946696
def _find_text(soup: "BeautifulSoup", tag: str, css_class: str) -> str | None:
    """Return the text of the first ``tag`` with ``css_class``, or ``None`` if absent."""
    node = soup.find(tag, attrs={"class": css_class})
    return node.text if node is not None else None


async def get_article(num: int) -> dict | None:
    """Download one article page and extract its content and metadata.

    Args:
        num: Numeric article id; the page is requested at
            ``ru/articles/{num}`` relative to ``BASE_URL``.

    Returns:
        ``None`` when the page has no ``h1.tm-title`` heading (this, not the
        HTTP status, is how a missing article is detected).  Otherwise a dict
        with the keys:

        * ``title`` - heading text;
        * ``text`` - article body converted to Markdown;
        * ``difficulty`` - text of the complexity label, or ``None``;
        * ``views`` - text of the view counter, or ``None``;
        * ``time`` - text of the reading-time label, or ``None``;
        * ``data`` - text of the "datetime published" element, or ``None``;
        * ``author`` - user name with double quotes and surrounding
          whitespace removed, or ``None``.

    Raises:
        Whatever ``aiohttp`` raises for network failures, and whatever
        ``convert_to_markdown`` raises if the article body is missing.
    """
    async with (
        ClientSession(base_url=BASE_URL) as session,
        session.get(f"ru/articles/{num}") as r,
    ):
        html = await r.text()

    # Parsing needs only the downloaded text, so it happens after the
    # connection and session have been released.
    soup = BeautifulSoup(html, "html.parser")

    title = soup.find("h1", attrs={"class": "tm-title tm-title_h1"})
    if title is None:
        return None

    text = convert_to_markdown(soup.find("div", attrs={"class": "tm-article-body"}))

    # Every metadata element is optional: a missing one yields None instead
    # of an AttributeError that would discard the whole article.
    author = _find_text(soup, "a", "tm-user-info__username")
    if author is not None:
        author = author.replace('"', "").strip()

    return {
        "title": title.text,
        "text": text,
        "difficulty": _find_text(soup, "span", "tm-article-complexity__label"),
        "views": _find_text(soup, "span", "tm-icon-counter__value"),
        "time": _find_text(soup, "span", "tm-article-reading-time__label"),
        "data": _find_text(soup, "span", "tm-article-datetime-published"),
        "author": author,
    }
def save(a: dict) -> None:
    """Write one parsed article to ``articles/<title>.md``.

    The file name is the article title with every character dropped whose
    lower-case form is not in ``ALLOWED_ALPHABET``.  The file starts with a
    ``---``-delimited front-matter block (tags, views, time, data), followed
    by an optional author link line and the Markdown body.

    Args:
        a: Article dict with the keys ``title``, ``text``, ``difficulty``,
            ``views``, ``time``, ``data`` and ``author``.  ``difficulty`` and
            ``author`` may be ``None``/empty, in which case the tags line is
            left blank and the author line is omitted.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    filename = "".join(ch for ch in a["title"] if ch.lower() in ALLOWED_ALPHABET)
    target = Path(f"articles/{filename}.md")
    # The output directory may not exist on a first run.
    target.parent.mkdir(parents=True, exist_ok=True)

    tags_line = "tags: " + a["difficulty"] if a["difficulty"] else ""
    author_block = "автор: [[users/" + a["author"] + "]]\n\n" if a["author"] else ""
    # NOTE(review): pages are fetched from habr.com, so links may never start
    # with "https://habr.ru/users/"; confirm the intended prefix.
    body = a["text"].replace("https://habr.ru/users/", "users/")

    # Explicit UTF-8: the content is Cyrillic and the platform default
    # encoding is not guaranteed to be able to represent it.
    with target.open("w", encoding="utf-8") as f:
        f.write(
            "---\n"
            f"{tags_line}\n"
            f"views: {a['views']}\n"
            f"time: {a['time']}\n"
            f"data: {a['data']}\n"
            "---\n\n"
            f"{author_block}"
            f"{body}",
        )
async def fetch(i: int) -> None:
    """Download article ``i`` and save it, logging the outcome.

    Every log line is prefixed with ``[i:START - i]``, i.e. the article id
    and its offset from the ``START`` id.

    Args:
        i: Numeric article id passed to ``get_article``.

    Outcomes:
        * ``get_article`` returned nothing: a debug "404" line is logged.
        * article parsed: a success line is logged, then it is saved.
        * any exception: it is logged as an error and the coroutine pauses
          for 15 seconds; the exception is never propagated to the caller.
    """
    # Offset from the reference id, taken from the module constant instead of
    # repeating the literal in every log call.
    offset = START - i
    try:
        a = await get_article(i)
        if not a:
            logger.debug("[{}:{}] 404", i, offset)
            return
        logger.success("[{}:{}] ok", i, offset)
        save(a)
    except Exception as e:  # noqa: BLE001
        logger.error("[{}:{}] {}", i, offset, e)
        # NOTE(review): the pause is treated as a back-off after a failure;
        # the pasted source lost its indentation, so confirm it belongs to
        # the except branch rather than to the end of the function.
        await asyncio.sleep(15)
async def main() -> None:
    """Log to ``obsa-habr.log`` and fetch articles one at a time, counting ids down."""
    logger.add("obsa-habr.log")

    # 946696 was the start id; this countdown begins at 864008 and visits
    # every second id down to 2.
    article_id = 864008
    while article_id > 0:
        await fetch(article_id)
        article_id -= 2
# Start the scraper only when the file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment