- Developed under Python 3.7.12
- Standalone script which parses HackMD markdown (including imgur links) and creates directories (from tags) containing the downloaded imgur images
- To use,
python3 script.py --zip_file <file.zip>
- To export your hackmd markdown as zip file
Last active
January 26, 2022 19:48
-
-
Save 3outeille/1cd8997246fe2f906b9dddf2cf6820a6 to your computer and use it in GitHub Desktop.
Standalone script which parses HackMD markdown (including imgur links) and creates directories (from tags) containing the downloaded imgur images
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import re | |
import itertools | |
import os | |
import argparse | |
import zipfile | |
import shutil | |
import subprocess | |
import filecmp | |
import logging | |
import sys | |
def progressbar(it, prefix="", size=60, file=sys.stdout):
    """Yield the items of *it* while drawing a text progress bar on *file*.

    Args:
        it: A sized iterable (``len(it)`` is called up front).
        prefix: Text written before the bar on every refresh.
        size: Width of the bar in characters.
        file: Writable text stream that receives the bar.

    Yields:
        Each item of *it*, in order. The bar is redrawn after every item and
        a final newline is written once the iterable is exhausted.
    """
    # Adapted from https://stackoverflow.com/a/34482761
    count = len(it)

    def show(j):
        # An empty iterable has nothing to advance through: draw the bar as
        # complete instead of dividing by zero.
        x = int(size * j / count) if count else size
        file.write("%s[%s%s] %i/%i\r" % (prefix, "#" * x, "." * (size - x), j, count))
        file.flush()

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i + 1)
    file.write("\n")
    file.flush()
# Send log records to log.txt (truncated on every run because of
# filemode='w'), keeping INFO and above, each prefixed with a timestamp.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    filename='log.txt',
    filemode='w',
    level=logging.INFO,
)

# Module-level logger shared by the functions below.
logger = logging.getLogger(__name__)
def getTags(lines):
    """Return the tags declared on the second line of a note.

    The second line is searched for ``tags:`` followed by a comma-separated
    list. Spaces are removed from the list before splitting, so a tag never
    contains a space.

    Args:
        lines: The note's lines, as returned by ``readlines()``.

    Returns:
        The list of tag strings, or ``[]`` when the note has fewer than two
        lines, has no ``tags:`` marker on its second line, or declares no tag.
    """
    if len(lines) < 2:
        return []

    # No trailing "\n" is required: the tag line may be the last line of a
    # file that does not end with a newline.
    is_tags = re.search(r"tags:(.*)", lines[1])
    if is_tags is None:
        return []

    raw_tags = is_tags.group(1).replace(" ", "").split(",")
    # strip() drops a stray "\r" from CRLF files; empty entries ("a,,b" or a
    # trailing comma) are skipped so they cannot become empty path segments.
    return [tag.strip() for tag in raw_tags if tag.strip()]
def getPath(tags):
    """Return the relative directory path under which a note is archived.

    Every ordering of *tags* is joined with "/" and checked against the
    file system (relative to the current working directory). If an ordering
    already exists as a path it is reused, so the same set of tags keeps
    mapping to the same directory; if several exist, the last one in
    permutation order wins. When none exists, the tags are joined in the
    order given.

    Args:
        tags: List of tag strings; may be empty.

    Returns:
        ``"no_tags"`` for an empty list, otherwise the "/"-joined path.
    """
    if not tags:
        return "no_tags"

    chosen = None
    for permut in itertools.permutations(tags):
        candidate = "/".join(permut)
        # The first ordering is the fallback; any ordering that already
        # exists on disk overrides whatever was chosen before it.
        if chosen is None or os.path.exists(candidate):
            chosen = candidate
    return chosen
# Characters replaced by "_" in archived file names: punctuation, brackets,
# quotes, "/" and the space. Written as a raw string so that "\[" and "\]"
# are regex escapes rather than invalid Python string escapes.
_UNSAFE_FILENAME_CHARS = re.compile(r"[-!$%^&*()+|~=`{}\[\]:\";'<>?,./ ]")


def preprocess_filename(filename):
    """Return *filename* with every unsafe character replaced by "_".

    Each matching character is replaced individually, so consecutive unsafe
    characters produce consecutive underscores.

    Args:
        filename: The note's file name without its extension.

    Returns:
        The sanitized name; characters outside the unsafe set are unchanged.
    """
    return _UNSAFE_FILENAME_CHARS.sub("_", filename)
# Inline markdown image with an empty alt text whose target is a PNG hosted
# on i.imgur.com. The URL cannot contain ")" or whitespace, so several images
# on the same line are matched one by one.
_IMGUR_PNG_LINK = re.compile(r"!\[\]\((https://i\.imgur\.com/[^)\s]+\.png)\)")


def _find_imgur_urls(text):
    """Return the imgur PNG URLs embedded as ``![](url)`` in *text*, in order."""
    return _IMGUR_PNG_LINK.findall(text)


def _download_image(url, target):
    """Download *url* to *target* with ``wget``; return True on success."""
    try:
        # Argument list instead of a shell string: the URL comes from the
        # note's content and must never be interpreted by a shell.
        subprocess.run(["wget", "-q", "-O", target, url], check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing wget binary. A failed download may leave
        # an output file behind, so remove it if it is there.
        if os.path.exists(target):
            os.remove(target)
        return False
    return True


def archive_markdown(files):
    """Archive each markdown note into a directory derived from its tags.

    For every path in *files*:

    * an empty file is skipped with a warning (and counted as parsed);
    * a file whose name does not end with ".md" is skipped with an error;
    * otherwise the note is copied to ``<tag path>/<name>/<name>.md`` and
      every imgur PNG it embeds is downloaded next to it as ``0.png``,
      ``1.png``, ... A note whose archived copy compares equal (per
      ``filecmp.cmp``) is left untouched; a changed note has its whole
      directory recreated.

    Progress is drawn with ``progressbar`` and details go to the module
    logger.

    Args:
        files: List of paths to the markdown files to archive.
    """
    count_parsed_files = 0

    for file in progressbar(files, "File parsed: "):
        # Read as UTF-8 regardless of the locale; undecodable bytes are
        # replaced rather than aborting the whole run.
        with open(file, encoding="utf-8", errors="replace") as f:
            lines = f.readlines()

        if not lines:
            logger.warning("[WARN] {} is an empty file. It will not be created".format(file))
            count_parsed_files += 1
            continue

        if not file.endswith(".md"):
            logger.error("[ERROR] {} is not a markdown file".format(file))
            continue

        path = getPath(getTags(lines))

        # Strip only the final ".md" so names containing ".md" elsewhere
        # (e.g. "notes.md.md") do not break the unpacking.
        filename = preprocess_filename(os.path.basename(file)[:-len(".md")])
        fullpath = path + "/" + filename
        fullpath_filename_extension = fullpath + "/" + filename + ".md"

        # Nothing to do when the archived copy compares equal to the note.
        if os.path.exists(fullpath_filename_extension) and filecmp.cmp(file, fullpath_filename_extension):
            logger.info("[INFO] '{}' no change.".format(file))
            count_parsed_files += 1
            continue

        # Start from a clean directory so images of a previous version of the
        # note do not linger.
        if os.path.exists(fullpath):
            shutil.rmtree(fullpath)
        os.makedirs(fullpath)
        shutil.copy(file, fullpath_filename_extension)

        image_urls = _find_imgur_urls("".join(lines))
        if not image_urls:
            logger.warning("[WARN] {} has no image url to parse".format(file))

        for index, url in enumerate(image_urls):
            target = fullpath + "/" + str(index) + ".png"
            if not _download_image(url, target):
                logger.error("[ERROR] {} of '{}' is not valid.".format(url, file))

        logger.info("[DONE] '{}'".format(file))
        logger.info("-" * 20)
        count_parsed_files += 1

    logger.info("[END] Ratio parsed_files/total_files: {} / {}".format(count_parsed_files, len(files)))
if __name__ == '__main__':
    # Command line: python3 script.py --zip_file <file.zip>
    parser = argparse.ArgumentParser()
    # Required: without it ZipFile(None) would fail with an obscure error
    # instead of a usage message.
    parser.add_argument(
        "--zip_file",
        required=True,
        help="zip archive containing the exported markdown notes",
    )
    args = parser.parse_args()

    # Always extract into a fresh tmp/ so files from a previous run are not
    # picked up again.
    if os.path.exists("tmp/"):
        shutil.rmtree("tmp/")
    os.makedirs("tmp/")

    with zipfile.ZipFile(args.zip_file, 'r') as zip_ref:
        zip_ref.extractall("tmp/")

    # Only markdown files at the top level of the archive are processed.
    files = glob.glob("tmp/*.md")
    archive_markdown(files)
    print("Informations are available in log.txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment