Created
September 24, 2021 08:57
-
-
Save taicaile/fe39bf0bad57c4d464b172e0e35efc2e to your computer and use it in GitHub Desktop.
A script that removes the watermarks from the HTML file generated by MindMaster. It is for study only; you should subscribe to the paid plan if you want to publish the file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
""" | |
A script that removes the watermarks from the HTML file generated by MindMaster. It is for study only; you should subscribe to the paid plan if you want to publish the file. | |
""" | |
import os | |
import re | |
import sys | |
import argparse | |
# Command-line interface: one positional argument naming the HTML file to
# clean. The module docstring is reused as the --help description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "file",
    type=str,
    help="the HTML file path",
)
args = parser.parse_args()
def read_file(file):
    """
    Return the text content of ``file``, decoded as UTF-8.

    Args:
        file: path of the file to read.

    Returns:
        The whole file content as a single string.

    Raises:
        SystemExit: with code -1 when ``file`` does not exist or is not a
            regular file. A diagnostic is written to stderr first.
    """
    if not os.path.exists(file):
        # Diagnostics go to stderr so they never mix with regular output.
        print(f"{file} not found, exit...", file=sys.stderr)
        sys.exit(-1)
    if not os.path.isfile(file):
        print(f"{file} is not a file, exit...", file=sys.stderr)
        sys.exit(-1)
    with open(file, "r", encoding="utf-8") as r_f:
        return r_f.read()
# Load the exported HTML document named on the command line.
html = read_file(args.file)

# Markup to strip: every <svg xmlns=...>...</svg> element (non-greedy match)
# and the <div id="copyright"> element.
# NOTE(review): the copyright pattern is greedy and DOTALL is on, so it runs
# up to the LAST "</div>" in the document -- confirm that is intended.
regexes = [r"<svg\sxmlns=.*?</svg>", r"<div\sid=\"copyright\">.*</div>"]
EMPTY = ""
for regex in regexes:
    # count and flags are passed by keyword: supplying them positionally is
    # deprecated since Python 3.13. DOTALL lets "." span line breaks.
    html = re.sub(regex, EMPTY, html, count=0, flags=re.MULTILINE | re.DOTALL)

# Write the result next to the input, e.g. "map.html" -> "map.clean.html".
STEM, EXT = os.path.splitext(args.file)
NEW_FILE = STEM + ".clean" + EXT
with open(NEW_FILE, "w+", encoding="utf-8") as wf:
    wf.write(html)
print(f"The watermarks were removed, please check the new file {NEW_FILE}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment