Created
May 19, 2023 07:38
-
-
Save hongqn/961e6c71c5175c708666943f9963683a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Mirror https://haoel.blog.csdn.net/ to local directory.""" | |
import time | |
from itertools import count | |
from subprocess import run | |
from typing import Iterator | |
import requests | |
def iter_article_urls(
    *, username: str = "haoel", size: int = 100, timeout: float = 30.0
) -> Iterator[str]:
    """Yield the URL of every blog article listed for a CSDN user.

    Pages through the CSDN ``get-business-list`` JSON API starting at page 1
    and stops after the first page that holds fewer than ``size`` entries.

    Args:
        username: CSDN account whose articles are listed.
        size: Number of articles requested per page.
        timeout: Seconds to wait for each HTTP response before giving up.

    Yields:
        The ``url`` field of each article, in the order the API returns them.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    for page in count(1):
        response = requests.get(
            "https://blog.csdn.net/community/home-api/v1/get-business-list",
            params={
                "page": page,
                "size": size,
                "businessType": "blog",
                "username": username,
            },
            headers={"Accept": "application/json", "User-Agent": "MSIE"},
            # Without a timeout a stalled connection would block forever.
            timeout=timeout,
        )
        response.raise_for_status()
        # Decode the body once and reuse it for both the yield loop and the
        # last-page check below.
        articles = response.json()["data"]["list"]
        for article in articles:
            yield article["url"]
        # A short page means there is nothing left to fetch.
        if len(articles) < size:
            break
def download_articls(wait: float = 2) -> None:
    """Mirror every article yielded by ``iter_article_urls`` using ``wget``.

    Each article URL is fetched by its own ``wget --mirror`` invocation, with
    page requisites downloaded and links converted for local browsing. A
    ``wget`` run that exits with a non-zero status is reported and the loop
    continues with the next article.

    Args:
        wait: Seconds passed to wget's ``--wait`` option and also slept
            between consecutive wget invocations.
    """
    # These options are the same for every article; only the URL changes.
    wget_options = [
        "wget",
        "--mirror",
        "--execute=robots=off",
        "--convert-links",
        "--adjust-extension",
        "--page-requisites",
        "--no-parent",
        f"--wait={wait}",
        "--random-wait",
        "--user-agent=MSIE",
    ]
    for url in iter_article_urls():
        print("Downloading", url)
        result = run([*wget_options, url])
        if result.returncode != 0:
            # wget can exit non-zero on partial failures (e.g. an error
            # response for one requisite), so report it instead of aborting
            # the whole mirror.
            print(f"wget exited with status {result.returncode} for {url}")
        time.sleep(wait)
def main():
    """Script entry point: download all articles with the default settings."""
    download_articls()
# Start the mirror only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment