Skip to content

Instantly share code, notes, and snippets.

@aymkx
Last active July 1, 2021 02:02
Show Gist options
  • Save aymkx/c2c845ad6bae7e1c45513186f0045a4c to your computer and use it in GitHub Desktop.
Save aymkx/c2c845ad6bae7e1c45513186f0045a4c to your computer and use it in GitHub Desktop.
app.netのアーカイブを取得するスクリプト。asyncioのこと何もわかってないのでかなり雑。まともな書き方を教えてほしい
#!/usr/bin/env python3
import asyncio
import io
import json
import os
from pathlib import Path
from typing import Optional
import aiohttp
from yarl import URL
# app.net account name whose archive is fetched; replace the placeholder.
USER = "<username>"
# URL path (not a filesystem path) of the per-user index file. It is built
# with explicit forward slashes so it is the same on every platform;
# os.path.join would insert backslashes on Windows.
POSTS_MAP = f"/users/{USER[0]}/{USER}.txt"
# Base URL of the archive host; request paths are applied with url.with_path().
url = URL("https://adn.micro.blog/")
# SHA-256 digest handed to aiohttp.Fingerprint to pin the server certificate.
FINGERPRINT_SHA256 = bytes.fromhex(
    "EB:29:57:29:59:C2:8E:11:F2:59:9E:24:DB:1A:28:5C:4F:04:D7:FB:56:BA:AC:80:0B:F1:CB:0E:56:38:9B:86".replace(
        ":", ""
    )
)
# NOTE: this name shadows the stdlib ``ssl`` module if it is ever imported
# here; it is kept because main() passes it to the TCPConnector by this name.
ssl = aiohttp.Fingerprint(FINGERPRINT_SHA256)
# Local file name under which the index file is saved.
mapfile = os.path.basename(POSTS_MAP)
def check_insecure_path(path: str) -> None:
    """Reject paths that could escape the download directory.

    The check is a deliberately conservative substring test: any path
    containing ``..`` anywhere is refused, even inside a file name.

    Args:
        path: Relative path about to be used as a local file name.

    Raises:
        ValueError: If ``path`` contains ``..``.
    """
    if ".." in path:
        raise ValueError(f"Path {path} is insecure")
async def store(resp: aiohttp.ClientResponse, file: Optional[str] = None):
    """Save the body of ``resp`` to a local file.

    JSON responses are decoded and re-serialised pretty-printed as UTF-8;
    any other content type is streamed to disk in binary chunks.

    Args:
        resp: Response whose body is read here.
        file: Destination path. When omitted, the path component of the
            response URL with leading slashes removed is used, so the file
            is created relative to the current directory.

    Raises:
        Whatever ``check_insecure_path`` raises for a path containing "..".
    """
    file = file or resp.url.path.lstrip("/")
    check_insecure_path(file)
    Path(file).parent.mkdir(parents=True, exist_ok=True)
    if resp.content_type == "application/json":
        # Decode before opening the file so a malformed body does not leave
        # an empty, truncated file behind.
        payload = await resp.json()
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # must be explicit rather than depend on the locale default.
        with open(file, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
            f.write("\n")
    else:
        with open(file, "wb") as f:
            # st_blksize is not available on every platform and may be
            # reported as 0; fall back to the default buffer size then.
            try:
                buf_size = getattr(os.stat(f.fileno()), "st_blksize", 0)
            except OSError:
                buf_size = 0
            if buf_size <= 1:
                buf_size = io.DEFAULT_BUFFER_SIZE
            while True:
                chunk = await resp.content.read(buf_size)
                if not chunk:
                    break
                f.write(chunk)
async def get_postdata(s: aiohttp.ClientSession, path: str):
    """Fetch one archive file and store it locally.

    Args:
        s: Session used to issue the GET request.
        path: URL path of the file, applied to the module-level base ``url``.

    A response whose status is not 200 is reported on stdout and skipped.
    """
    print(f"Getting {path}...")
    target = url.with_path(path)
    async with s.get(target) as resp:
        if resp.status == 200:
            print(f"Storing {path}...")
            await store(resp)
        else:
            print(f"Getting {path} failed: {resp.reason}")
async def main():
    """Download the index file, then every file it lists, concurrently.

    The index file is saved in the current directory. Each non-empty line
    of it is treated as a URL path and fetched with ``get_postdata``.
    A download that raises is reported on stdout and does not abort the
    remaining downloads.
    """
    mapfile = os.path.basename(POSTS_MAP)
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=ssl)) as s:
        print("Getting map file...")
        async with s.get(
            url.with_path(POSTS_MAP), raise_for_status=True
        ) as get_posts_map:
            await store(get_posts_map, mapfile)
        print("Getting map file finished.")

        print("Starting to get post data...")
        with open(mapfile) as f:
            stripped = (line.strip() for line in f)
            postfiles = [postfile for postfile in stripped if postfile]

        # return_exceptions=True keeps one failing download from propagating
        # out of gather() and closing the session while the other downloads
        # are still in flight.
        results = await asyncio.gather(
            *(get_postdata(s, postfile) for postfile in postfiles),
            return_exceptions=True,
        )
        for postfile, result in zip(postfiles, results):
            if isinstance(result, BaseException):
                print(f"Getting {postfile} failed: {result!r}")
if __name__ == "__main__":
    # Start the download only when executed as a script, not on import.
    asyncio.run(main())
@aymkx
Copy link
Author

aymkx commented Jun 30, 2021

アーカイブについては https://www.manton.org/2020/05/11/appnet-archive-now.html の記事にありました。その後進展がないようで、Let's Encryptで取得された証明書も1月に有効期限が切れており、いつまで取得できるかわかりません。

想定動作は、カレントディレクトリ下に<username>.txtを保存し、それを頼りに投稿データをposts以下に保存することです。純粋に投稿データのみのダウンロードで、画像等の回収はしていません。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment