Skip to content

Instantly share code, notes, and snippets.

@hongqn
Created May 19, 2023 07:38
Show Gist options
  • Save hongqn/961e6c71c5175c708666943f9963683a to your computer and use it in GitHub Desktop.
Save hongqn/961e6c71c5175c708666943f9963683a to your computer and use it in GitHub Desktop.
"""Mirror https://haoel.blog.csdn.net/ to local directory."""
import time
from itertools import count
from subprocess import run
from typing import Iterator
import requests
def iter_article_urls() -> Iterator[str]:
    """Yield the URL of every article listed for the user "haoel".

    Walks the paginated JSON listing API one page at a time, starting at
    page 1 and asking for ``size`` entries per page. Iteration stops after
    the first page that contains fewer than ``size`` entries.

    Yields:
        The ``url`` field of each entry in ``data.list``, in the order the
        API returns them.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.Timeout: If the server does not answer within the timeout.
        KeyError: If the response JSON lacks the ``data``/``list``/``url``
            keys.
    """
    size = 100
    for page in count(1):
        response = requests.get(
            "https://blog.csdn.net/community/home-api/v1/get-business-list",
            params=dict(page=page, size=size, businessType="blog", username="haoel"),
            headers={"Accept": "application/json", "User-Agent": "MSIE"},
            # Without a timeout a stalled connection would block forever.
            timeout=30,
        )
        response.raise_for_status()
        # Decode the body once and reuse it for both the loop and the
        # end-of-listing check.
        articles = response.json()["data"]["list"]
        for article in articles:
            yield article["url"]
        # A short page means there is nothing left to fetch.
        if len(articles) < size:
            break
def download_articls(wait: float = 2):
    """Mirror every article returned by ``iter_article_urls`` using wget.

    One ``wget --mirror`` process is started per article URL, in the
    current working directory. Progress is printed to stdout. A wget run
    that exits with a non-zero status is reported and the loop moves on to
    the next article.

    Args:
        wait: Number of seconds passed to wget's ``--wait`` option and also
            slept between consecutive wget invocations.
    """
    for url in iter_article_urls():
        print("Downloading", url)
        cmd = [
            "wget",
            "--mirror",
            "--execute=robots=off",
            "--convert-links",
            "--adjust-extension",
            "--page-requisites",
            "--no-parent",
            f"--wait={wait}",
            "--random-wait",
            "--user-agent=MSIE",
            url,
        ]
        result = run(cmd)
        if result.returncode != 0:
            # Best-effort mirror: surface the failure instead of silently
            # dropping it, but keep going with the remaining articles.
            print(f"wget exited with status {result.returncode} for {url}")
        time.sleep(wait)
def main():
    """Script entry point: mirror all articles with the default settings."""
    download_articls()
# Run the mirror only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment