Last active
July 6, 2022 16:10
-
-
Save ChrisVilches/ab62b0f39a5af57260e7ed8dadaa1e41 to your computer and use it in GitHub Desktop.
Clean HTML files generated by the Windows Kindle app when exporting highlights.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import re | |
# Cleans up an HTML file of Kindle highlights by removing stray spaces
# around punctuation. Usage: pass the path of the .html file as the first
# command-line argument; the file is rewritten in place when anything changed.

# Plain substring replacements, applied in this order (order matters because
# an earlier rule can create or remove a match for a later one).
replacement_rules = {
    " .": ".",
    " ,": ",",
    "( ": "(",
    " )": ")",
    " :": ":",
    " ;": ";",
    " ?": "?",
    " !": "!",
    " - ": "-",
    " / ": "/",
    "“ ": "“",
    " ”": "”",
}

# Regular-expression replacements, applied after the plain ones.
# "# 12" -> "#12": drops the single space between '#' and a digit.
replacement_rules_regex = {
    r"# ([0-9])": r"#\1",
}


def is_kindle_highlight_file(path, data):
    """Return True if *path*/*data* look like a Kindle highlight export.

    The check requires the path to end with ".html" and the content to
    contain both the ``class='bookTitle'`` and ``class='noteHeading'``
    markers.
    """
    return (
        path.endswith(".html")
        and "class='bookTitle'" in data
        and "class='noteHeading'" in data
    )


def clean_text(data):
    """Return *data* with every replacement rule applied.

    Plain rules from ``replacement_rules`` run first, in dictionary order,
    followed by the regex rules from ``replacement_rules_regex``.
    """
    for old, new in replacement_rules.items():
        data = data.replace(old, new)
    for pattern, repl in replacement_rules_regex.items():
        data = re.sub(pattern, repl, data)
    return data


def main(argv=None):
    """Clean the highlight file named by the first command-line argument.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. Only the first entry is used.

    Returns:
        Process exit status: 0 on success, 2 when no file was given.

    Raises:
        ValueError: If the file does not look like a Kindle highlight
            HTML export.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        # Previously this surfaced as a bare IndexError on sys.argv[1].
        print("Usage: pass the Kindle highlight .html file as argument.",
              file=sys.stderr)
        return 2
    path = args[0]

    # 'with' guarantees the handle is closed even if reading fails.
    with open(path, "r", encoding="utf8") as file:
        original_data = file.read()

    if not is_kindle_highlight_file(path, original_data):
        raise ValueError(
            "It seems this file is not a Kindle highlight HTML file."
        )

    data = clean_text(original_data)

    if original_data == data:
        print("Data didn't change.")
    else:
        print("Data changed.")
        # Only touch the file when there is something new to write.
        with open(path, "w", encoding="utf8") as file:
            file.write(data)

    print("OK")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment