3outeille · January 26, 2022 19:48
diff --git a/parse_hackmd.md b/parse_hackmd.md
diff --git a/parse_hackmd.py b/parse_hackmd.py
 import glob
 import re
 import itertools
 import os
 import argparse
 import zipfile
 import shutil
 import subprocess
 import filecmp
 import logging
 import sys

 def progressbar(it, prefix="", size=60, file=sys.stdout):
    """Yield the items of ``it`` while drawing a text progress bar on ``file``.

    Adapted from https://stackoverflow.com/a/34482761.

    Args:
        it: Iterable to walk through. Iterables without a length (for
            example generators) are materialised into a list first so the
            total is known.
        prefix: Text written in front of the bar.
        size: Width of the bar in characters.
        file: Writable text stream that receives the bar.

    Yields:
        Each item of ``it``, in order.
    """
    try:
        count = len(it)
    except TypeError:
        # No __len__: materialise so the total can be displayed.
        it = list(it)
        count = len(it)

    def show(j):
        # An empty input has nothing left to do, so draw a full bar instead
        # of dividing by zero.
        x = int(size * j / count) if count else size
        file.write("%s[%s%s] %i/%i\r" % (prefix, "#" * x, "." * (size - x), j, count))
        file.flush()

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i + 1)
    file.write("\n")
    file.flush()

 # Send every record at INFO level or above to log.txt; filemode 'w'
 # truncates the file each time the logging setup runs.
 logging.basicConfig(
    level=logging.INFO,
    filename='log.txt',
    filemode='w',
    format='%(asctime)s - %(message)s',
 )
 logger = logging.getLogger(__name__)

 def getTags(lines):
    """Return the tags declared on the second line of a note.

    The second line is searched for ``tags:`` followed by a comma-separated
    list, e.g. ``tags: tag1, tag2``.

    Args:
        lines: Lines of the markdown file, as returned by ``readlines()``.

    Returns:
        List of tag names with all spaces removed. Empty when there are fewer
        than two lines, no ``tags:`` entry, or nothing after it.
    """
    if len(lines) < 2:
        return []
    # No trailing "\n" is required, so a tags line that ends the file without
    # a newline is still recognised ("." never matches the newline itself).
    match = re.search(r"tags:(.*)", lines[1])
    if match is None:
        return []
    raw_tags = match.group(1).replace(" ", "").split(",")
    # strip() drops a stray "\r" or tab; empty entries ("a,,b", trailing
    # comma) are discarded because they would become empty path components.
    return [tag.strip() for tag in raw_tags if tag.strip()]

 def getPath(tags):
    """Pick the folder path, built from ``tags``, under which a note is stored.

    Every ordering of the tags is joined with "/" and checked on disk, so a
    folder created earlier with the same tags in another order is reused.

    Args:
        tags: List of tag names.

    Returns:
        "no_tags" when ``tags`` is empty; otherwise the last ordering that
        already exists on disk, or the first ordering when none exists.
    """
    if len(tags) == 0:
        return "no_tags"

    chosen = None
    for permut in itertools.permutations(tags):
        candidate = "/".join(permut)
        # The first ordering is the fallback; any ordering found on disk
        # replaces it (only one is expected to exist, the last one wins).
        if chosen is None or os.path.exists(candidate):
            chosen = candidate
    return chosen

 def preprocess_filename(filename):
    """Replace punctuation and spaces in ``filename`` with underscores.

    Args:
        filename: File name without its extension.

    Returns:
        ``filename`` with every listed special character or space replaced
        by "_"; other characters are left untouched.
    """
    # Raw string: the former plain literal relied on invalid escape sequences
    # for the brackets and the slash, which recent Python versions warn about.
    special_chars = r"""[-!$%^&*()+|~=`{}\[\]:";'<>?,./ ]"""
    return re.sub(special_chars, "_", filename)

 def archive_markdown(files):
    """Copy each markdown note into a tag-based folder and fetch its images.

    For every path in ``files`` the note is copied to
    ``<tag path>/<name>/<name>.md`` and each PNG it embeds through
    ``![](https://i.imgur.com/....png)`` is downloaded next to it with wget
    as ``0.png``, ``1.png``, ... A note whose archived copy is already
    identical is left untouched. Progress is drawn with ``progressbar`` and
    details are written through ``logger``.

    Args:
        files: List of paths to markdown files.
    """
    # Done in Python instead of a shell "grep" pipeline: dots are escaped and
    # the URL stops at the closing parenthesis, so several images on one line
    # are found separately and nothing from the note reaches a shell.
    image_pattern = re.compile(r"!\[\]\((https://i\.imgur\.com/[^)\s]+\.png)\)")
    count_parsed_files = 0

    for file in progressbar(files, "File parsed: "):
        # Reject non-markdown input before trying to read it as text.
        if not file.endswith(".md"):
            logger.error("[ERROR] {} is not a markdown file".format(file))
            continue

        with open(file) as f:
            lines = f.readlines()

        if len(lines) == 0:
            logger.warning("[WARN] {} is an empty file. It will not be created".format(file))
            count_parsed_files += 1
            continue

        path = getPath(getTags(lines))

        # Strip only the trailing extension so a name that contains ".md"
        # elsewhere does not break the parsing.
        filename = preprocess_filename(os.path.basename(file)[:-len(".md")])
        fullpath = path + "/" + filename
        fullpath_filename_extension = fullpath + "/" + filename + ".md"

        # Do nothing if the archived copy is identical to the new one.
        if os.path.exists(fullpath_filename_extension) and filecmp.cmp(file, fullpath_filename_extension):
            logger.info("[INFO] '{}' no change.".format(file))
            count_parsed_files += 1
            continue

        # Start from a clean folder so images of a previous version do not
        # survive, then copy the note into it.
        if os.path.exists(fullpath):
            shutil.rmtree(fullpath)
        os.makedirs(fullpath)
        shutil.copy(file, fullpath_filename_extension)

        image_urls = image_pattern.findall("".join(lines))
        if not image_urls:
            logger.warning("[WARN] {} has no image url to parse".format(file))

        for index, url in enumerate(image_urls):
            image_path = fullpath + "/" + str(index) + ".png"
            try:
                # Argument list, no shell: the URL and the path are passed to
                # wget verbatim.
                subprocess.run(["wget", "-q", "-O", image_path, url], check=True)
            except (subprocess.CalledProcessError, OSError):
                # OSError covers a missing wget binary, which the former
                # shell invocation also reported as a failed command.
                if os.path.exists(image_path):
                    os.remove(image_path)
                logger.error("[ERROR] {} of '{}' is not valid.".format(url, file))

        logger.info("[DONE] '{}'".format(file))
        logger.info("-"*20)
        count_parsed_files += 1

    logger.info("[END] Ratio parsed_files/total_files: {} / {}".format(count_parsed_files, len(files)))

 if __name__ == '__main__':
    # Entry point: unpack the given zip archive into tmp/ and archive every
    # markdown file found at its top level.
    parser = argparse.ArgumentParser()
    # Nothing can be done without the archive, so let argparse report a
    # missing option instead of failing later inside zipfile.
    parser.add_argument("--zip_file", required=True,
                        help="path to the zip archive containing the markdown files")
    args = parser.parse_args()

    # Always start from an empty scratch directory.
    if os.path.exists("tmp/"):
        shutil.rmtree("tmp/")
    os.makedirs("tmp/")

    with zipfile.ZipFile(args.zip_file, 'r') as zip_ref:
        zip_ref.extractall("tmp/")

    files = glob.glob("tmp/*.md")
    archive_markdown(files)
    print("Informations are available in log.txt")
	import glob
	import re
	import itertools
	import os
	import argparse
	import zipfile
	import shutil
	import subprocess
	import filecmp
	import logging
	import sys

	def progressbar(it, prefix="", size=60, file=sys.stdout):
	    """Yield the items of ``it`` while drawing a text progress bar on ``file``.

	    Adapted from https://stackoverflow.com/a/34482761.

	    Args:
	        it: Iterable to walk through. Iterables without a length (for
	            example generators) are materialised into a list first so the
	            total is known.
	        prefix: Text written in front of the bar.
	        size: Width of the bar in characters.
	        file: Writable text stream that receives the bar.

	    Yields:
	        Each item of ``it``, in order.
	    """
	    try:
	        count = len(it)
	    except TypeError:
	        # No __len__: materialise so the total can be displayed.
	        it = list(it)
	        count = len(it)

	    def show(j):
	        # An empty input has nothing left to do, so draw a full bar instead
	        # of dividing by zero.
	        x = int(size * j / count) if count else size
	        # The repetition operators had been lost from this line.
	        file.write("%s[%s%s] %i/%i\r" % (prefix, "#" * x, "." * (size - x), j, count))
	        file.flush()

	    show(0)
	    for i, item in enumerate(it):
	        yield item
	        show(i + 1)
	    file.write("\n")
	    file.flush()

	# Send every record at INFO level or above to log.txt; filemode 'w'
	# truncates the file each time the logging setup runs.
	logging.basicConfig(
	    level=logging.INFO,
	    filename='log.txt',
	    filemode='w',
	    format='%(asctime)s - %(message)s',
	)
	logger = logging.getLogger(__name__)

	def getTags(lines):
	    """Return the tags declared on the second line of a note.

	    The second line is searched for ``tags:`` followed by a comma-separated
	    list, e.g. ``tags: tag1, tag2``.

	    Args:
	        lines: Lines of the markdown file, as returned by ``readlines()``.

	    Returns:
	        List of tag names with all spaces removed. Empty when there are fewer
	        than two lines, no ``tags:`` entry, or nothing after it.
	    """
	    if len(lines) < 2:
	        return []
	    # No trailing "\n" is required, so a tags line that ends the file without
	    # a newline is still recognised ("." never matches the newline itself).
	    match = re.search(r"tags:(.*)", lines[1])
	    if match is None:
	        return []
	    raw_tags = match.group(1).replace(" ", "").split(",")
	    # strip() drops a stray "\r" or tab; empty entries ("a,,b", trailing
	    # comma) are discarded because they would become empty path components.
	    return [tag.strip() for tag in raw_tags if tag.strip()]

	def getPath(tags):
	    """Pick the folder path, built from ``tags``, under which a note is stored.

	    Every ordering of the tags is joined with "/" and checked on disk, so a
	    folder created earlier with the same tags in another order is reused.

	    Args:
	        tags: List of tag names.

	    Returns:
	        "no_tags" when ``tags`` is empty; otherwise the last ordering that
	        already exists on disk, or the first ordering when none exists.
	    """
	    if len(tags) == 0:
	        return "no_tags"

	    chosen = None
	    for permut in itertools.permutations(tags):
	        candidate = "/".join(permut)
	        # The first ordering is the fallback; any ordering found on disk
	        # replaces it (only one is expected to exist, the last one wins).
	        if chosen is None or os.path.exists(candidate):
	            chosen = candidate
	    return chosen

	def preprocess_filename(filename):
	    """Replace punctuation and spaces in ``filename`` with underscores.

	    Args:
	        filename: File name without its extension.

	    Returns:
	        ``filename`` with every listed special character or space replaced
	        by "_"; other characters are left untouched.
	    """
	    # Raw string: the former plain literal relied on invalid escape sequences
	    # for the brackets, the pipe and the slash, which recent Python versions
	    # warn about.
	    special_chars = r"""[-!$%^&*()+|~=`{}\[\]:";'<>?,./ ]"""
	    return re.sub(special_chars, "_", filename)

	def archive_markdown(files):
	    """Copy each markdown note into a tag-based folder and fetch its images.

	    For every path in ``files`` the note is copied to
	    ``<tag path>/<name>/<name>.md`` and each PNG it embeds through
	    ``![](https://i.imgur.com/....png)`` is downloaded next to it with wget
	    as ``0.png``, ``1.png``, ... A note whose archived copy is already
	    identical is left untouched. Progress is drawn with ``progressbar`` and
	    details are written through ``logger``.

	    Args:
	        files: List of paths to markdown files.
	    """
	    # Done in Python instead of a shell "grep" pipeline: dots are escaped and
	    # the URL stops at the closing parenthesis, so several images on one line
	    # are found separately and nothing from the note reaches a shell.
	    image_pattern = re.compile(r"!\[\]\((https://i\.imgur\.com/[^)\s]+\.png)\)")
	    count_parsed_files = 0

	    for file in progressbar(files, "File parsed: "):
	        # Reject non-markdown input before trying to read it as text.
	        if not file.endswith(".md"):
	            logger.error("[ERROR] {} is not a markdown file".format(file))
	            continue

	        with open(file) as f:
	            lines = f.readlines()

	        if len(lines) == 0:
	            logger.warning("[WARN] {} is an empty file. It will not be created".format(file))
	            count_parsed_files += 1
	            continue

	        path = getPath(getTags(lines))

	        # Strip only the trailing extension so a name that contains ".md"
	        # elsewhere does not break the parsing.
	        filename = preprocess_filename(os.path.basename(file)[:-len(".md")])
	        fullpath = path + "/" + filename
	        fullpath_filename_extension = fullpath + "/" + filename + ".md"

	        # Do nothing if the archived copy is identical to the new one.
	        if os.path.exists(fullpath_filename_extension) and filecmp.cmp(file, fullpath_filename_extension):
	            logger.info("[INFO] '{}' no change.".format(file))
	            count_parsed_files += 1
	            continue

	        # Start from a clean folder so images of a previous version do not
	        # survive, then copy the note into it.
	        if os.path.exists(fullpath):
	            shutil.rmtree(fullpath)
	        os.makedirs(fullpath)
	        shutil.copy(file, fullpath_filename_extension)

	        image_urls = image_pattern.findall("".join(lines))
	        if not image_urls:
	            logger.warning("[WARN] {} has no image url to parse".format(file))

	        for index, url in enumerate(image_urls):
	            image_path = fullpath + "/" + str(index) + ".png"
	            try:
	                # Argument list, no shell: the URL and the path are passed to
	                # wget verbatim.
	                subprocess.run(["wget", "-q", "-O", image_path, url], check=True)
	            except (subprocess.CalledProcessError, OSError):
	                # OSError covers a missing wget binary, which the former
	                # shell invocation also reported as a failed command.
	                if os.path.exists(image_path):
	                    os.remove(image_path)
	                logger.error("[ERROR] {} of '{}' is not valid.".format(url, file))

	        logger.info("[DONE] '{}'".format(file))
	        logger.info("-"*20)
	        count_parsed_files += 1

	    logger.info("[END] Ratio parsed_files/total_files: {} / {}".format(count_parsed_files, len(files)))

	if __name__ == '__main__':
	    # Entry point: unpack the given zip archive into tmp/ and archive every
	    # markdown file found at its top level.
	    parser = argparse.ArgumentParser()
	    # Nothing can be done without the archive, so let argparse report a
	    # missing option instead of failing later inside zipfile.
	    parser.add_argument("--zip_file", required=True,
	                        help="path to the zip archive containing the markdown files")
	    args = parser.parse_args()

	    # Always start from an empty scratch directory.
	    if os.path.exists("tmp/"):
	        shutil.rmtree("tmp/")
	    os.makedirs("tmp/")

	    with zipfile.ZipFile(args.zip_file, 'r') as zip_ref:
	        zip_ref.extractall("tmp/")

	    files = glob.glob("tmp/*.md")
	    archive_markdown(files)
	    print("Informations are available in log.txt")