n8henrie · January 10, 2025 18:36
diff --git a/fix_joplin_md_links.py b/fix_joplin_md_links.py
 #!/usr/bin/env python3
 """
 Quick hack to fix links to local files in Markdown notes exported from Joplin 3.1.24.

 Run at your own risk, ideally on a duplicate copy of your exported notes, as
 this makes destructive changes that would be difficult to undo. This may break
 your notes.

 Further, special cases like parentheses in filenames are handled differently by
 different markdown renderers. You may have to alter the script depending on what
 viewer you're using.

 When Joplin exports notes to markdown, it applies special logic to any links to local files.

 - files are placed into a `_resources` directory
 - the filenames are truncated and special characters removed for Windows friendliness: https://github.com/laurent22/joplin/blob/0cec4753d8afaa5673eba526c121d8b1e1062a5d/packages/lib/path-utils.ts#L77
 - the markdown links point to these sanitized filenames
 - the title of the markdown links seems to preserve the original filename

 Example: a joplin note with a link to `my long file.pdf` might turn into this markdown: `[my long file.pdf](my_long_file.pd)`

 On Linux / macOS this provides some inconveniences, as the sanitization
 unfortunately unnecessarily truncates filenames, replaces characters like spaces
 unnecessarily, and corrupts the file extensions, breaking their ability to
 automatically open in default apps.

 This script makes a best effort attempt to reverse this by:

 - recovering the original filename from the link title
 - using this information to rename the file
 - rewriting the markdown link in the note accordingly

 Python 3.13, no other dependencies

 Should be run from the root directory of the joplin export (containing
 `_resources` and `My Notes`)

 See also: https://github.com/laurent22/joplin/issues/11012#issuecomment-2581606039

 """

 import re
 from pathlib import Path
 from urllib.parse import urlparse

 # Matches an inline Markdown link (or image, when prefixed with `!`) whose url
 # starts with `../_resources/`. Named groups: `img` (the optional `!`),
 # `title` (link text, which may not contain square brackets) and `url`
 # (everything up to the first closing parenthesis).
 RE_MD_LINK = re.compile(
    r"""
    (?P<img>!)? # maybe image tag
    \[ # opening bracket
    (?P<title>[^[\]]+?) # the link title e.g. `[title]`
    \]\( # closing bracket and opening parenthesis
    (?P<url>\.\./_resources/[^)]+?) # the link url e.g. `(url)`
    \) # closing parenthesis
    """,
    re.VERBOSE,
 )
 # Matches text that looks like `name.ext`: at least one character, a dot, then
 # an extension of 1-3 word characters. Longer extensions (e.g. `.docx`) do not
 # match this pattern.
 RE_FILE_W_EXT = re.compile(r"^(?P<base>.+)\.(?P<ext>\w{1,3})$")


 def main():
    """Rename sanitized resource files and repoint the Markdown links at them.

    Scans every `*.md` file directly inside `My Notes` (relative to the
    current working directory). For each non-image link matched by
    `RE_MD_LINK` whose title looks like `name.ext`, the linked file is renamed
    to the title and the link in the note is rewritten to match.

    A link is left untouched when:

    - its title does not look like a filename with an extension
    - the renamed file would not sit directly inside `_resources`
    - it already points at the desired name
    - the linked file is missing and was not renamed earlier in this run
    - a different file already occupies the desired name

    Returns:
        None. Files are renamed on disk and changed notes are rewritten in
        place.
    """
    resources_root = Path("./_resources").resolve()

    # Renames performed during this run (old path -> new path). A resource can
    # be linked more than once, possibly from several notes; later links must
    # follow the file to its new name instead of being skipped because the old
    # name no longer exists.
    renamed = {}

    md_files = Path("My Notes").glob("*.md")
    for f in md_files:
        text = f.read_text()

        # str is immutable; accumulate edits in a copy while iterating over
        # the matches found in the original text
        text_copy = text

        for match in RE_MD_LINK.finditer(text):
            # img tags seem to be renamed to some kind of UUID that avoids issues
            # with filename truncation
            if match["img"]:
                continue

            url = match["url"]

            scheme = urlparse(url.strip()).scheme
            # skip remote urls, this is only for local files
            if scheme:
                continue

            # Normalize the title exactly once so that the name the file is
            # renamed to and the name written into the link always agree.
            title = match["title"].replace("/", "_").strip()

            # if this doesn't look like a filename with an extension, skip it
            if not RE_FILE_W_EXT.match(title):
                continue

            # join parent directory and the title found in the markdown link
            # e.g. `../_resources/my file.pd` -> `../_resources/my file.pdf`
            old_file = Path(url.removeprefix("../"))
            new_file = old_file.parent / title

            # mitigate potential issues with filenames (or urls) that have
            # potential for path traversal: only touch files that end up
            # directly inside `_resources`
            if new_file.resolve().parent != resources_root:
                continue

            # new and old are the same, skip
            if f"../{new_file}" == url:
                continue

            if old_file.is_file():
                # Never clobber a different file that already has this name;
                # `samefile` lets case-only renames through on
                # case-insensitive filesystems.
                if new_file.exists() and not new_file.samefile(old_file):
                    continue
                old_file.rename(new_file)
                renamed[old_file] = new_file
                target = new_file
            elif old_file in renamed:
                # Already renamed via an earlier link; just fix this link.
                target = renamed[old_file]
            else:
                # This link doesn't point to an existing file, skip
                continue

            # Escape closing parentheses in the url part
            # Depending on your markdown viewer, it might or might not accept this.
            #
            # Other strategies that are worth considering:
            #   - url encoding the path
            #   - enclosing the url in `<>`
            #
            # I tried `Marked 2`, `Inlyne`, and `MarkText`; unfortunately there was
            # no single strategy that worked in all 3, so I stick with a simple escape
            # which works for `Marked 2`.
            #
            # See also: https://stackoverflow.com/questions/13824669/how-do-you-write-a-link-containing-a-closing-bracket-in-markdown-syntax
            new_path_replacement = f"../{target}".replace(")", r"\)")

            # replace the full markdown link in the text with a new link to the renamed file
            old_link = match.group(0)
            new_link = f"[{title}]({new_path_replacement})"
            text_copy = text_copy.replace(old_link, new_link)

        # Leave notes without any rewritten link untouched on disk.
        if text_copy != text:
            f.write_text(text_copy)


 # Run the link fixer only when this file is executed as a script, not when it
 # is imported as a module.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Quick hack to fix links to local files in Markdown notes exported from Joplin 3.1.24.

	Run at your own risk, ideally on a duplicate copy of your exported notes, as
	this makes destructive changes that would be difficult to undo. This may break
	your notes.

	Further, special cases like parentheses in filenames are handled differently by
	different markdown renderers. You may have to alter the script depending on what
	viewer you're using.

	When Joplin exports notes to markdown, it applies special logic to any links to local files.

	- files are placed into a `_resources` directory
	- the filenames are truncated and special characters removed for Windows friendliness: https://github.com/laurent22/joplin/blob/0cec4753d8afaa5673eba526c121d8b1e1062a5d/packages/lib/path-utils.ts#L77
	- the markdown links point to these sanitized filenames
	- the title of the markdown links seems to preserve the original filename

	Example: a joplin note with a link to `my long file.pdf` might turn into this markdown: `[my long file.pdf](my_long_file.pd)`

	On Linux / macOS this provides some inconveniences, as the sanitization
	unfortunately unnecessarily truncates filenames, replaces characters like spaces
	unnecessarily, and corrupts the file extensions, breaking their ability to
	automatically open in default apps.

	This script makes a best effort attempt to reverse this by:

	- recovering the original filename from the link title
	- using this information to rename the file
	- rewriting the markdown link in the note accordingly

	Python 3.13, no other dependencies

	Should be run from the root directory of the joplin export (containing
	`_resources` and `My Notes`)

	See also: https://github.com/laurent22/joplin/issues/11012#issuecomment-2581606039

	"""

	import re
	from pathlib import Path
	from urllib.parse import urlparse

	# Matches an inline Markdown link (or image, when prefixed with `!`) whose url
	# starts with `../_resources/`. Named groups: `img` (the optional `!`),
	# `title` (link text, which may not contain square brackets) and `url`
	# (everything up to the first closing parenthesis).
	RE_MD_LINK = re.compile(
	r"""
	(?P<img>!)? # maybe image tag
	\[ # opening bracket
	(?P<title>[^[\]]+?) # the link title e.g. `[title]`
	\]\( # closing bracket and opening parenthesis
	(?P<url>\.\./_resources/[^)]+?) # the link url e.g. `(url)`
	\) # closing parenthesis
	""",
	re.VERBOSE,
	)
	# Matches text that looks like `name.ext`: at least one character, a dot, then
	# an extension of 1-3 word characters. Longer extensions (e.g. `.docx`) do not
	# match this pattern.
	RE_FILE_W_EXT = re.compile(r"^(?P<base>.+)\.(?P<ext>\w{1,3})$")


	def main():
	    """Rename sanitized resource files and repoint the Markdown links at them.

	    Scans every `*.md` file directly inside `My Notes` (relative to the
	    current working directory). For each non-image link matched by
	    `RE_MD_LINK` whose title looks like `name.ext`, the linked file is renamed
	    to the title and the link in the note is rewritten to match.

	    A link is left untouched when:

	    - its title does not look like a filename with an extension
	    - the renamed file would not sit directly inside `_resources`
	    - it already points at the desired name
	    - the linked file is missing and was not renamed earlier in this run
	    - a different file already occupies the desired name

	    Returns:
	        None. Files are renamed on disk and changed notes are rewritten in
	        place.
	    """
	    resources_root = Path("./_resources").resolve()

	    # Renames performed during this run (old path -> new path). A resource can
	    # be linked more than once, possibly from several notes; later links must
	    # follow the file to its new name instead of being skipped because the old
	    # name no longer exists.
	    renamed = {}

	    md_files = Path("My Notes").glob("*.md")
	    for f in md_files:
	        text = f.read_text()

	        # str is immutable; accumulate edits in a copy while iterating over
	        # the matches found in the original text
	        text_copy = text

	        for match in RE_MD_LINK.finditer(text):
	            # img tags seem to be renamed to some kind of UUID that avoids issues
	            # with filename truncation
	            if match["img"]:
	                continue

	            url = match["url"]

	            scheme = urlparse(url.strip()).scheme
	            # skip remote urls, this is only for local files
	            if scheme:
	                continue

	            # Normalize the title exactly once so that the name the file is
	            # renamed to and the name written into the link always agree.
	            title = match["title"].replace("/", "_").strip()

	            # if this doesn't look like a filename with an extension, skip it
	            if not RE_FILE_W_EXT.match(title):
	                continue

	            # join parent directory and the title found in the markdown link
	            # e.g. `../_resources/my file.pd` -> `../_resources/my file.pdf`
	            old_file = Path(url.removeprefix("../"))
	            new_file = old_file.parent / title

	            # mitigate potential issues with filenames (or urls) that have
	            # potential for path traversal: only touch files that end up
	            # directly inside `_resources`
	            if new_file.resolve().parent != resources_root:
	                continue

	            # new and old are the same, skip
	            if f"../{new_file}" == url:
	                continue

	            if old_file.is_file():
	                # Never clobber a different file that already has this name;
	                # `samefile` lets case-only renames through on
	                # case-insensitive filesystems.
	                if new_file.exists() and not new_file.samefile(old_file):
	                    continue
	                old_file.rename(new_file)
	                renamed[old_file] = new_file
	                target = new_file
	            elif old_file in renamed:
	                # Already renamed via an earlier link; just fix this link.
	                target = renamed[old_file]
	            else:
	                # This link doesn't point to an existing file, skip
	                continue

	            # Escape closing parentheses in the url part
	            # Depending on your markdown viewer, it might or might not accept this.
	            #
	            # Other strategies that are worth considering:
	            #   - url encoding the path
	            #   - enclosing the url in `<>`
	            #
	            # I tried `Marked 2`, `Inlyne`, and `MarkText`; unfortunately there was
	            # no single strategy that worked in all 3, so I stick with a simple escape
	            # which works for `Marked 2`.
	            #
	            # See also: https://stackoverflow.com/questions/13824669/how-do-you-write-a-link-containing-a-closing-bracket-in-markdown-syntax
	            new_path_replacement = f"../{target}".replace(")", r"\)")

	            # replace the full markdown link in the text with a new link to the renamed file
	            old_link = match.group(0)
	            new_link = f"[{title}]({new_path_replacement})"
	            text_copy = text_copy.replace(old_link, new_link)

	        # Leave notes without any rewritten link untouched on disk.
	        if text_copy != text:
	            f.write_text(text_copy)


	# Run the link fixer only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	    main()