|
import re |
|
import shutil |
|
from pathlib import Path |
|
from tempfile import NamedTemporaryFile |
|
from urllib.parse import urlparse |
|
|
|
import requests |
|
|
|
# Matches a Markdown image link: ![comment](filename "optional title")
#   comment      - the alt text between the square brackets
#   filename     - the link target, without the blanks that separate it from
#                  the title (previously the trailing blank was captured too,
#                  which produced paths such as "x.png ")
#   optionalpart - the quoted title including its quotes, or None when absent.
#                  The title is matched lazily so that two titled images on
#                  the same line are found as two separate links instead of
#                  being merged into one match.
REGEX_IMAGE = re.compile(
    r'!\[(?P<comment>[^\]]*)\]\((?P<filename>.*?)[ \t]*(?=\"|\))(?P<optionalpart>\".*?\")?\)'
)

# Host names (compared against urlparse(...).netloc) from which remote images
# may be downloaded. Images on any other remote host are skipped.
WHITELIST = []

# Host name whose images are read from the local checkout in p_imgcdn instead
# of being downloaded.
LOCAL_CDN_DOMAIN = "your-cdn.domain"

# Directory scanned recursively for "*.md" posts.
p_posts = Path("./_posts")

# Local directory searched for images whose URL host is LOCAL_CDN_DOMAIN.
p_imgcdn = Path("../../img-cdn-local")

# Root directory that receives the exported posts and their images.
p_export = Path("./_export")
|
|
|
|
|
# Seconds to wait for a remote image; keeps one unresponsive host from
# hanging the whole export.
_DOWNLOAD_TIMEOUT = 30


def _image_link(match: re.Match, target_name: str) -> str:
    """Rebuild the Markdown image link of *match* so it points at *target_name*.

    The alt text and the optional quoted title of the original link are kept;
    only the link target changes.
    """
    title = match.group("optionalpart")
    title_part = f" {title}" if title else ""
    return f"![{match.group('comment')}]({target_name}{title_part})"


# Export every post under p_posts as <p_export>/<relative path without
# suffix>/index.zh.md, copy the images it references next to it as
# "<match index><suffix>", and rewrite the image links to those copies.
p_export.mkdir(parents=True, exist_ok=True)

for p in p_posts.rglob("*.md"):
    content = p.read_text(encoding="utf-8")

    dstdir = p_export / p.relative_to(p_posts).with_suffix("")
    dstdir.mkdir(parents=True, exist_ok=True)
    dstmd = dstdir / "index.zh.md"

    # Links are matched once against the unmodified text; `content` is
    # rewritten as the images are processed.
    images = list(REGEX_IMAGE.finditer(content))
    seen = set()   # every image target already handled in this post
    exported = {}  # image target -> file name of its copy inside dstdir

    for i, img in enumerate(images):
        # The pattern may capture the blank that separates the target from a
        # quoted title, so normalise before using the value as a path or URL.
        raw = img.group("filename").strip()
        if not raw:
            print(p, img, "empty")
            continue
        if raw in seen:
            print(p, img, "duplicated")
            # Same image referenced again (possibly with another alt text or
            # title): reuse the existing copy instead of leaving the link
            # pointing at the original location.
            if raw in exported:
                content = content.replace(
                    img.group(0), _image_link(img, exported[raw])
                )
            continue
        seen.add(raw)

        downloaded = False  # True when `src` is a temporary file we created
        if raw.startswith(("http://", "https://")):
            url = urlparse(raw)
            if url.netloc == LOCAL_CDN_DOMAIN:
                src = p_imgcdn / url.path[1:]
                # Raised explicitly rather than asserted so the check still
                # runs when Python is started with -O.
                if not src.exists():
                    raise FileNotFoundError(
                        f"{p}: CDN image {url.path} should exist on {src}"
                    )
                print(p, img, "found CDN image")
            elif url.netloc not in WHITELIST:
                print(p, img, "skipped not in whitelist.")
                continue
            else:
                try:
                    resp = requests.get(raw, timeout=_DOWNLOAD_TIMEOUT)
                    # Without this an HTTP error page would be stored as the
                    # image.
                    resp.raise_for_status()
                except requests.RequestException as exc:
                    print(p, img, "download failed:", exc)
                    continue
                suffix = Path(url.path).suffix
                with NamedTemporaryFile(suffix=suffix, delete=False) as tmpfile:
                    tmpfile.write(resp.content)
                src = Path(tmpfile.name)
                downloaded = True
                print(p, img, "downloaded to", src)
        else:
            src = Path(raw)
            print(p, img, "is local image")
            if not src.exists():
                print(p, img, "not exists")
                continue

        # The name uses the index of the match within the post, so skipped
        # images leave gaps in the numbering.
        dst = dstdir / f"{i:02d}{src.suffix}"
        try:
            shutil.copyfile(src, dst)
        finally:
            # delete=False leaves the temporary download on disk; remove it
            # once it has been copied (or the copy has failed).
            if downloaded:
                src.unlink(missing_ok=True)

        exported[raw] = dst.name
        content = content.replace(img.group(0), _image_link(img, dst.name))

    dstmd.write_text(content, encoding="utf-8")