Last active
July 1, 2021 02:02
-
-
Save aymkx/c2c845ad6bae7e1c45513186f0045a4c to your computer and use it in GitHub Desktop.
app.netのアーカイブを取得するスクリプト。asyncioのこと何もわかってないのでかなり雑。まともな書き方を教えてほしい
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import asyncio | |
import io | |
import json | |
import os | |
from pathlib import Path | |
from typing import Optional | |
import aiohttp | |
from yarl import URL | |
# Account name whose archive is downloaded; replace the placeholder before
# running the script.
USER = "<username>"
# URL path of the per-user map file, which lists one post file path per line.
# This is a URL path, not a filesystem path, so it is assembled with "/"
# explicitly; os.path.join would insert backslashes on Windows.
POSTS_MAP = f"/users/{USER[0]}/{USER}.txt"
# Base URL of the archive host; request URLs are derived from it by replacing
# the path component.
url = URL("https://adn.micro.blog/")

# Raw SHA-256 digest bytes, decoded octet by octet from the colon-separated
# hex notation.
FINGERPRINT_SHA256 = bytes(
    int(octet, 16)
    for octet in "EB:29:57:29:59:C2:8E:11:F2:59:9E:24:DB:1A:28:5C:4F:04:D7:FB:56:BA:AC:80:0B:F1:CB:0E:56:38:9B:86".split(
        ":"
    )
)

# Fingerprint object handed to the TCP connector as its ``ssl`` argument.
# NOTE(review): presumably used to pin the server certificate instead of
# validating it through the CA chain -- confirm against the aiohttp docs.
# The name shadows the stdlib ``ssl`` module, which this file does not import.
ssl = aiohttp.Fingerprint(FINGERPRINT_SHA256)

# Final path component of POSTS_MAP, i.e. the local file name for the map file.
mapfile = os.path.split(POSTS_MAP)[1]
def check_insecure_path(path: str) -> None:
    """Reject a file path that could escape the current working directory.

    A path is considered insecure when it is anchored (has a root or a drive,
    e.g. ``/etc/passwd``) or when any of its components is exactly ``..``.
    File names that merely contain two consecutive dots, such as
    ``a..b.json``, are accepted.

    Args:
        path: Relative path about to be used as a local file name.

    Raises:
        ValueError: If ``path`` is anchored or contains a ``..`` component.
    """
    candidate = Path(path)
    # Compare whole components rather than substrings so that only real
    # parent-directory references are rejected.
    if candidate.anchor or ".." in candidate.parts:
        raise ValueError(f"Path {path} is insecure")
async def store(resp: aiohttp.ClientResponse, file: Optional[str] = None) -> None:
    """Write the body of an HTTP response to a local file.

    JSON responses are re-serialized as indented UTF-8 text with a trailing
    newline; every other content type is streamed to disk unchanged.

    Args:
        resp: Response whose body is saved.
        file: Destination path. When omitted, the path of the response URL
            with its leading slashes removed is used.

    Raises:
        Exception: Propagated from ``check_insecure_path`` when the
            destination is rejected.
    """
    file = file or resp.url.path.lstrip("/")
    check_insecure_path(file)
    Path(file).parent.mkdir(parents=True, exist_ok=True)

    if resp.content_type == "application/json":
        # Parse before opening the file so that a malformed body does not
        # leave an empty, truncated file behind.
        payload = await resp.json()
        # ensure_ascii=False emits non-ASCII characters verbatim, so the
        # encoding must not depend on the platform's locale default.
        with open(file, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
            f.write("\n")
        return

    with open(file, "wb") as f:
        try:
            # st_blksize is not available on every platform (e.g. Windows),
            # hence the getattr fallback.
            buf_size = getattr(os.stat(f.fileno()), "st_blksize", 0)
        except OSError:
            buf_size = 0
        if buf_size <= 0:
            buf_size = io.DEFAULT_BUFFER_SIZE
        while True:
            chunk = await resp.content.read(buf_size)
            if not chunk:
                break
            f.write(chunk)
async def get_postdata(s: aiohttp.ClientSession, path: str):
    """Fetch one post file from the archive host and save it.

    Progress is printed to stdout. A response whose status is not 200 is
    reported and skipped; nothing is written in that case.

    Args:
        s: Open client session used for the request.
        path: URL path of the post file, substituted into the base URL.
    """
    print(f"Getting {path}...")
    target = url.with_path(path)
    async with s.get(target) as resp:
        if resp.status == 200:
            print(f"Storing {path}...")
            # No explicit file name: store() derives one from the response.
            await store(resp)
        else:
            print(f"Getting {path} failed: {resp.reason}")
async def main():
    """Download the user's map file, then every post file it lists.

    The map file is saved in the current directory under its base name and
    read back line by line; each non-empty line is treated as the URL path of
    one post file. All post downloads run concurrently on a single session.

    Raises:
        aiohttp.ClientResponseError: If the map file request does not succeed.
        BaseException: The first error raised by any post download, re-raised
            only after every download has finished.
    """
    mapfile = os.path.basename(POSTS_MAP)
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=ssl)) as s:
        print("Getting map file...")
        async with s.get(
            url.with_path(POSTS_MAP), raise_for_status=True
        ) as get_posts_map:
            await store(get_posts_map, mapfile)
        print("Getting map file finished.")

        print("Starting to get post data...")
        with open(mapfile, encoding="utf-8") as f:
            stripped = (line.strip() for line in f)
            postfiles = [postfile for postfile in stripped if postfile]

        # return_exceptions=True keeps one failing download from propagating
        # immediately, which would close the session while the remaining
        # downloads are still in flight.
        results = await asyncio.gather(
            *(get_postdata(s, postfile) for postfile in postfiles),
            return_exceptions=True,
        )

    failures = []
    for postfile, result in zip(postfiles, results):
        if isinstance(result, BaseException):
            print(f"Getting {postfile} failed: {result!r}")
            failures.append(result)
    if failures:
        # Still surface the failure to the caller, as a bare gather() would.
        raise failures[0]
# Start the download only when executed as a script, so that importing this
# module has no network side effects.
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
アーカイブについては https://www.manton.org/2020/05/11/appnet-archive-now.html の記事にありました。その後進展がないようで、Let's Encryptで取得された証明書も1月に有効期限が切れており、いつまで取得できるかわかりません。
想定動作はカレントディレクトリ下に<username>.txtを保存し、それを頼りに投稿データをposts以下に保存することです。純粋に投稿データのみのダウンロードで画像等の回収はしていません。