Export notes from the OS X Notes.app to separate Markdown files, each prefixed with its creation timestamp.
-- Sanitize a note title so it can be used in a filename, capped at 20 characters.
on buildTitle(originalText)
	set normalizedText to my replace(originalText, ":", "-")
	set normalizedText to my replace(normalizedText, "|", "")
	set normalizedText to my replace(normalizedText, "{", "")
	set normalizedText to my replace(normalizedText, "}", "")
	set normalizedText to my replace(normalizedText, " ", "_")
	set normalizedText to my replace(normalizedText, "/", "_")
	set normalizedText to my replace(normalizedText, "\\", "_")
	set normalizedText to my replace(normalizedText, "*", "")
	set normalizedText to my replace(normalizedText, ".", "")
	set normalizedText to my replace(normalizedText, "<", "_")
	set normalizedText to my replace(normalizedText, "\"", "_")
	set finalTitle to my firstChars(normalizedText, 20)
	return finalTitle
end buildTitle

-- Replace every occurrence of fromText with toText using text item delimiters.
on replace(originalText, fromText, toText)
	set AppleScript's text item delimiters to the fromText
	set the item_list to every text item of originalText
	set AppleScript's text item delimiters to the toText
	set originalText to the item_list as string
	set AppleScript's text item delimiters to ""
	return originalText
end replace

-- Convert a note's HTML body to Markdown by piping it through the html2md.py
-- converter (the second file in this gist); adjust the path to html2md.py if needed.
on html2md(original)
	set original to my replace(original, "<br>", "\n")
	set md to do shell script "/bin/echo -n " & quoted form of original & " | python html2md.py"
	return md
end html2md

-- Return at most the first maxChars characters of originalText.
on firstChars(originalText, maxChars)
	if length of originalText is less than maxChars then
		return originalText
	else
		set limitedText to text 1 thru maxChars of originalText
		return limitedText
	end if
end firstChars

-- Overwrite the file at filename with filecontents, encoded as UTF-8.
on writeToFile(filename, filecontents)
	set the output to open for access file filename with write permission
	set eof of the output to 0
	write filecontents to the output starting at eof as «class utf8»
	close access the output
end writeToFile

-- Format an AppleScript date as YYYY-MM-DD_HH_MM for use as a filename prefix.
on dateTimeToIso(dt)
	set {year:y, month:m, day:d, hours:h, minutes:min, seconds:s} to dt
	set y to text 2 through -1 of ((y + 10000) as text)
	set m to text 2 through -1 of ((m + 100) as text)
	set d to text 2 through -1 of ((d + 100) as text)
	set h to text 2 through -1 of ((h + 100) as text)
	set min to text 2 through -1 of ((min + 100) as text)
	set s to text 2 through -1 of ((s + 100) as text)
	return y & "-" & m & "-" & d & "_" & h & "_" & min
end dateTimeToIso

-- Main: ask for an export folder, then write every note as <timestamp>_<title>.md.
tell application "Notes"
	activate
	set exportFolder to POSIX path of (choose folder with prompt "Please choose the export folder") as string
	set exportFolder to (POSIX file exportFolder) as string
	set counter to 0
	repeat with each in every note
		set noteName to name of each
		set noteBody to body of each
		set noteMD to my html2md(noteBody)
		set noteTitle to my buildTitle(noteName)
		set creationDate to creation date of each
		set timeStamp to my dateTimeToIso(creationDate) as string
		set counter to counter + 1
		set filename to ((exportFolder as string) & timeStamp & "_" & noteTitle & ".md")
		my writeToFile(filename, noteMD as text)
	end repeat
end tell
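For reference, each exported file is named <creation timestamp>_<sanitized title>.md: the timestamp comes from dateTimeToIso and the title from buildTitle. Below is a minimal Python sketch of that naming scheme, handy for checking what filenames to expect; export_filename and its arguments are illustrative and not part of the gist.

from datetime import datetime

# Character substitutions mirroring the AppleScript buildTitle handler.
REPLACEMENTS = [(":", "-"), ("|", ""), ("{", ""), ("}", ""), (" ", "_"),
                ("/", "_"), ("\\", "_"), ("*", ""), (".", ""), ("<", "_"), ('"', "_")]

def export_filename(created, title, max_chars=20):
    """Build <YYYY-MM-DD_HH_MM>_<sanitized title>.md, like the AppleScript does."""
    stamp = created.strftime("%Y-%m-%d_%H_%M")
    for old, new in REPLACEMENTS:
        title = title.replace(old, new)
    return "%s_%s.md" % (stamp, title[:max_chars])

print(export_filename(datetime(2022, 6, 7, 14, 37), "Shopping list: groceries"))
# -> 2022-06-07_14_37_Shopping_list-_groce.md

The converter the AppleScript shells out to, html2md.py, is a bundled copy of Aaron Swartz's html2text and follows below.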
"""html2text: Turn HTML into equivalent Markdown-structured text.""" | |
__version__ = "2.39" | |
__author__ = "Aaron Swartz ([email protected])" | |
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." | |
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] | |
# TODO: | |
# Support decoded entities with unifiable. | |
if not hasattr(__builtins__, 'True'): True, False = 1, 0 | |
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types | |
import sgmllib | |
import urlparse | |
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') | |
try: from textwrap import wrap | |
except: pass | |
# Use Unicode characters instead of their ascii psuedo-replacements | |
UNICODE_SNOB = 0 | |
# Put the links after each paragraph instead of at the end. | |
LINKS_EACH_PARAGRAPH = 1 | |
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) | |
BODY_WIDTH = 0 | |
# Don't show internal links (href="#local-anchor") -- corresponding link targets | |
# won't be visible in the plain text file anyway. | |
SKIP_INTERNAL_LINKS = True | |
### Entity Nonsense ### | |
def name2cp(k):
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])

unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}

unifiable_n = {}

for k in unifiable.keys():
    unifiable_n[name2cp(k)] = unifiable[k]

def charref(name):
    if name[0] in ['x','X']:
        c = int(name[1:], 16)
    else:
        c = int(name)

    if not UNICODE_SNOB and c in unifiable_n.keys():
        return unifiable_n[c]
    else:
        return unichr(c)

def entityref(c):
    if not UNICODE_SNOB and c in unifiable.keys():
        return unifiable[c]
    else:
        try: name2cp(c)
        except KeyError: return "&" + c
        else: return unichr(name2cp(c))

def replaceEntities(s):
    s = s.group(1)
    if s[0] == "#":
        return charref(s[1:])
    else: return entityref(s)

r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
    return r_unescape.sub(replaceEntities, s)

def fixattrs(attrs):
    # Fix bug in sgmllib.py
    if not attrs: return attrs
    newattrs = []
    for attr in attrs:
        newattrs.append((attr[0], unescape(attr[1])))
    return newattrs

### End Entity Nonsense ###
def onlywhite(line):
    """Return true if the line does only consist of whitespace characters."""
    for c in line:
        if c is not ' ' and c is not ' ':
            return c is ' '
    return line

def optwrap(text):
    """Wrap all paragraphs in the provided text."""
    if not BODY_WIDTH:
        return text

    assert wrap, "Requires Python 2.3."
    result = ''
    newlines = 0
    for para in text.split("\n"):
        if len(para) > 0:
            if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
                for line in wrap(para, BODY_WIDTH):
                    result += line + "\n"
                result += "\n"
                newlines = 2
            else:
                if not onlywhite(para):
                    result += para + "\n"
                    newlines = 1
        else:
            if newlines < 2:
                result += "\n"
                newlines += 1
    return result

def hn(tag):
    if tag[0] == 'h' and len(tag) == 2:
        try:
            n = int(tag[1])
            if n in range(1, 10): return n
        except ValueError: return 0
class _html2text(sgmllib.SGMLParser):
    def __init__(self, out=None, baseurl=''):
        sgmllib.SGMLParser.__init__(self)

        if out is None: self.out = self.outtextf
        else: self.out = out
        self.outtext = u''
        self.quiet = 0
        self.p_p = 0
        self.outcount = 0
        self.start = 1
        self.space = 0
        self.a = []
        self.astack = []
        self.acount = 0
        self.list = []
        self.blockquote = 0
        self.pre = 0
        self.startpre = 0
        self.lastWasNL = 0
        self.abbr_title = None # current abbreviation definition
        self.abbr_data = None # last inner HTML (for abbr being defined)
        self.abbr_list = {} # stack of abbreviations to write later
        self.baseurl = baseurl

    def outtextf(self, s):
        self.outtext += s

    def close(self):
        sgmllib.SGMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        return self.outtext

    def handle_charref(self, c):
        self.o(charref(c))

    def handle_entityref(self, c):
        self.o(entityref(c))

    def unknown_starttag(self, tag, attrs):
        self.handle_tag(tag, attrs, 1)

    def unknown_endtag(self, tag):
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """ returns the index of certain set of attributes (of a link) in the
            self.a list

            If the set of attributes is not found, returns None
        """
        if not attrs.has_key('href'): return None

        i = -1
        for a in self.a:
            i += 1
            match = 0

            if a.has_key('href') and a['href'] == attrs['href']:
                if a.has_key('title') or attrs.has_key('title'):
                    if (a.has_key('title') and attrs.has_key('title') and
                        a['title'] == attrs['title']):
                        match = True
                else:
                    match = True

            if match: return i
    def handle_tag(self, tag, attrs, start):
        attrs = fixattrs(attrs)

        if hn(tag):
            self.p()
            if start: self.o(hn(tag)*"#" + ' ')

        if tag in ['p', 'div']: self.p()

        if tag == "br" and start: self.o("  \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        if tag in ["head", "style", 'script']:
            if start: self.quiet += 1
            else: self.quiet -= 1

        if tag in ["body"]:
            self.quiet = 0 # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p(); self.o('> ', 0, 1); self.start = 1
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        if tag in ['em', 'i', 'u']: self.o("_")
        if tag in ['strong', 'b']: self.o("**")
        if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
        if tag == "abbr":
            if start:
                attrsD = {}
                for (x, y) in attrs: attrsD[x] = y
                attrs = attrsD

                self.abbr_title = None
                self.abbr_data = ''
                if attrs.has_key('title'):
                    self.abbr_title = attrs['title']
            else:
                if self.abbr_title != None:
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                self.abbr_data = ''

        if tag == "a":
            if start:
                attrsD = {}
                for (x, y) in attrs: attrsD[x] = y
                attrs = attrsD
                if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
                    self.astack.append(attrs)
                    self.o("[")
                else:
                    self.astack.append(None)
            else:
                if self.astack:
                    a = self.astack.pop()
                    if a:
                        i = self.previousIndex(a)
                        if i is not None:
                            a = self.a[i]
                        else:
                            self.acount += 1
                            a['count'] = self.acount
                            a['outcount'] = self.outcount
                            self.a.append(a)
                        self.o("][" + `a['count']` + "]")

        if tag == "img" and start:
            attrsD = {}
            for (x, y) in attrs: attrsD[x] = y
            attrs = attrsD
            if attrs.has_key('src'):
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                i = self.previousIndex(attrs)
                if i is not None:
                    attrs = self.a[i]
                else:
                    self.acount += 1
                    attrs['count'] = self.acount
                    attrs['outcount'] = self.outcount
                    self.a.append(attrs)
                self.o("![")
                self.o(alt)
                self.o("]["+`attrs['count']`+"]")

        if tag == 'dl' and start: self.p()
        if tag == 'dt' and not start: self.pbr()
        if tag == 'dd' and start: self.o('    ')
        if tag == 'dd' and not start: self.pbr()

        if tag in ["ol", "ul"]:
            if start:
                self.list.append({'name':tag, 'num':0})
            else:
                if self.list: self.list.pop()

            self.p()

        if tag == 'li':
            if start:
                self.pbr()
                if self.list: li = self.list[-1]
                else: li = {'name':'ul', 'num':0}
                self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul": self.o("* ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(`li['num']`+". ")
                self.start = 1
            else:
                self.pbr()

        if tag in ["table", "tr"] and start: self.p()
        if tag == 'td': self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()
    def pbr(self):
        if self.p_p == 0: self.p_p = 1

    def p(self): self.p_p = 2

    def o(self, data, puredata=0, force=0):
        if self.abbr_data is not None: self.abbr_data += data

        if not self.quiet:
            if puredata and not self.pre:
                data = re.sub('\s+', ' ', data)
                if data and data[0] == ' ':
                    self.space = 1
                    data = data[1:]
            if not data and not force: return

            if self.startpre:
                #self.out(" :") #TODO: not output when already one there
                self.startpre = 0

            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote: bq += " "

            if self.pre:
                bq += "    "
                data = data.replace("\n", "\n"+bq)

            if self.start:
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0

            if self.p_p:
                self.out(('\n'+bq)*self.p_p)
                self.space = 0

            if self.space:
                if not self.lastWasNL: self.out(' ')
                self.space = 0

            if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
                if force == "end": self.out("\n")

                newa = []
                for link in self.a:
                    if self.outcount > link['outcount']:
                        self.out("   ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
                        if link.has_key('title'): self.out(" ("+link['title']+")")
                        self.out("\n")
                    else:
                        newa.append(link)

                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out("  *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.lastWasNL = data and data[-1] == '\n'
            self.outcount += 1

    def handle_data(self, data):
        if r'\/script>' in data: self.quiet -= 1
        self.o(data, 1)

    def unknown_decl(self, data): pass
def wrapwrite(text): sys.stdout.write(text.encode('utf8'))

def html2text_file(html, out=wrapwrite, baseurl=''):
    h = _html2text(out, baseurl)
    h.feed(html)
    h.feed("")
    return h.close()

def html2text(html, baseurl=''):
    return optwrap(html2text_file(html, None, baseurl))

if __name__ == "__main__":
    baseurl = ''
    data = sys.stdin.read().decode('utf8')
    sys.stdout.write(html2text(data, baseurl).encode('utf8'))
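A minimal usage sketch for the converter above, mirroring how the AppleScript's html2md handler drives it: HTML goes in on stdin and Markdown comes out on stdout. Python 2 is assumed (the script relies on sgmllib, htmlentitydefs, and unichr), and the html2md.py filename is taken from the AppleScript's shell command:

# Illustrative only (Python 2): pipe HTML through the converter,
# just as the AppleScript does with /bin/echo and a shell pipe.
import subprocess

html = '<h1>Groceries</h1><p>Buy <b>milk</b> and <a href="http://example.com">bread</a>.</p>'
p = subprocess.Popen(['python', 'html2md.py'],
                     stdin=subprocess.PIPE, stdout=subprocess.PIPE)
md, _ = p.communicate(html)
print md  # roughly: "# Groceries", "**milk**", and a numbered [bread][1] reference link

Alternatively, import the module and call html2text(html) directly; it returns the Markdown as a unicode string.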