Last active
September 19, 2024 17:03
-
-
Save jheddings/80df4f3acaa0f52ea9523be093341f46 to your computer and use it in GitHub Desktop.
Import Apple Notes into Notion.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# !! NOTE - this script is no longer maintained... please see the repo for further | |
# updates: https://github.com/jheddings/notes2notion | |
# this script attempts to migrate from Apple Notes to Notion while retaining as | |
# much information and formatting as possible. there are limitations to the | |
# export data from Notes, so we try to preserve the intent of the original note. | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | |
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
import os | |
import sys | |
import re | |
import yaml | |
from notion.client import NotionClient | |
from notion.block import PageBlock, TextBlock, CodeBlock, ImageBlock | |
from notion.block import HeaderBlock, SubheaderBlock, SubsubheaderBlock | |
from notion.block import BulletedListBlock, NumberedListBlock | |
from notion.block import CollectionViewBlock, DividerBlock, QuoteBlock | |
try: | |
from yaml import CLoader as Loader, CDumper as Dumper | |
except ImportError: | |
from yaml import Loader, Dumper | |
################################################################################ | |
## CONFIGURATION | |
# Notion session token: copy the value of the token_v2 cookie from a logged-in
# browser session (any cookie inspector will show it)
notion_token_v2 = 'PUT_YOUR_TOKEN_V2_HERE'

# URL of the top-level Notion page for the import; each note becomes a sub page
import_page_url = 'ARCHIVE_PAGE_URL'

# optional URL of a Notion database used as an import log.  when set, progress
# is written to it and its status is checked before a note is uploaded again.
#
# expected schema of the log database:
#   Name (title) -> name of the note
#   Note ID (text) -> the original note ID
#   Status [Pending, Failed, Finished] -> current status of the migration
#   Page (URL) -> the link to the imported note
#   Timestamp (Creation date) [optional] -> date/time of the log entry
#
# leave as None to turn the log off
import_log_url = None

# when True, the leading Title line of each note is not imported
skip_title = True

# when True, the raw note metadata is added to the imported page
include_meta = True

# when True, the raw note HTML is added to the imported page.  pictures are
# encoded directly in that HTML, so this can cause problems for such notes
include_html = False
# lookup table: HTML element name exported by Notes -> Notion block class
block_map = dict(
    h1=HeaderBlock,
    h2=SubheaderBlock,
    h3=SubsubheaderBlock,
    tt=CodeBlock,
    pre=CodeBlock,
    ul=BulletedListBlock,
    ol=NumberedListBlock,
)
################################################################################ | |
def notes_to_notion(html, page):
    """Convert the HTML body of a note into blocks appended to `page`.

    Only the top-level nodes of the parsed document are visited: `div`
    elements are handed to `append_block`, `ul` / `ol` elements to
    `append_list`, and any other named element is reported and skipped.
    Bare strings between elements are ignored.
    """
    from bs4 import BeautifulSoup

    # Apple Notes exports pretty basic HTML: no html, head or body wrapper
    document = BeautifulSoup(html, 'html.parser')

    for node in document.children:
        tag = node.name

        # nodes without a name are plain strings
        if tag is None:
            continue

        if tag == 'div':
            # the append_* helpers do the heavy lifting
            append_block(page, node)
        elif tag in ('ul', 'ol'):
            # lists are handled separately
            append_list(page, node)
        else:
            print(f'-- UNKNOWN BLOCK: {tag}')
################################################################################ | |
# Notion supports inline markdown for common formatting...
def markup_text(tag, text):
    """Wrap `text` in the inline markdown that corresponds to an HTML `tag`.

    Bold (`b`, `strong`), italic (`i`, `em`), strike-through (`strike`) and
    link (`a`) tags are decorated; any other tag (including `u`, which has no
    markdown equivalent) returns `text` unchanged.
    """
    decorations = {
        'b': ('**', '**'),
        'strong': ('**', '**'),
        'i': ('*', '*'),
        'em': ('*', '*'),
        'strike': ('~~', '~~'),
        'a': ('<', '>'),
    }

    decoration = decorations.get(tag)

    # unknown tags pass the text straight through
    if decoration is None:
        return text

    prefix, suffix = decoration
    return prefix + text + suffix
################################################################################ | |
def get_block_text(block):
    """Return the text of a parsed HTML node with inline markdown applied.

    A node without a name is a plain string and is returned as `str(block)`.
    For any other node the text of each child is collected recursively,
    trimmed, joined with single spaces, and passed through `markup_text`
    using the node's own tag name.
    """
    # no-name blocks are just strings...
    if block.name is None:
        return str(block)

    # otherwise, iterate over the text in the child elements
    # we could use this method to do additional processing on the text
    # e.g. we could look for things that look like URL's and make links
    # e.g. we could look for lines that start with '-' and make lists
    pieces = []

    for child in block.children:
        piece = get_block_text(child)
        if piece is None:
            continue

        # trim BEFORE testing for emptiness: whitespace-only children (such as
        # the newlines between elements) would otherwise be kept as empty
        # strings and show up as stray spaces in the joined text
        piece = piece.strip()
        if piece:
            pieces.append(piece)

    return markup_text(block.name, ' '.join(pieces))
################################################################################ | |
def build_schema(thead):
    """Build a collection schema from a sequence of column names.

    The first column gets the id `title` and the type `title`; every other
    column gets the id `c<index>` and the type `text`.  Returns a dict that
    maps each column id to its `{'name': ..., 'type': ...}` entry.
    """
    return {
        ('title' if position == 0 else f'c{position}'): {
            'name': label,
            'type': 'title' if position == 0 else 'text',
        }
        for position, label in enumerate(thead)
    }
################################################################################ | |
def append_block(page, elem):
    """Append the Notion block(s) for one top-level element to `page`.

    Direct `img` children are delegated to `append_imgs` and direct `object`
    children to `append_objects`.  Otherwise the tag of the first child picks
    the block type from `block_map` (falling back to `TextBlock`) and the
    element's text becomes the block title.

    Returns the result of the delegated call or the new block.  Returns None
    when `elem` is None, when it has no children, when it has no text, or
    when it is the leading `h1` title and `skip_title` is set.
    """
    if elem is None:
        return None

    # there may be more than one image in a block
    imgs = elem('img', recursive=False)
    if imgs:
        return append_imgs(page, imgs)

    # handle objects (like tables)
    objs = elem('object', recursive=False)
    if objs:
        return append_objects(page, objs)

    # most of our decisions will be based on the first child of the block...
    # an element without children has nothing to import, so bail out quietly
    # instead of letting next() raise StopIteration
    first_child = next(iter(elem.children), None)
    if first_child is None:
        return None

    # if this is the first child on the page, assume it is the title
    if first_child.name == 'h1' and skip_title and len(page.children) == 0:
        return None

    # handle remaining elements as direct mapping to Notion blocks
    # assume any unknown block types are text and "do our best"
    block_type = block_map.get(first_child.name, TextBlock)

    text = get_block_text(elem)
    if not text:
        return None

    return page.children.add_new(block_type, title=text)
################################################################################ | |
def append_text(page, elem, markup=None):
    """Append the text of `elem` to `page` as a TextBlock.

    When `markup` is given it is placed both before and after the text.
    Returns the new block, or None when `get_block_text` yields None.
    """
    body = get_block_text(elem)

    if body is None:
        return None

    title = body if markup is None else markup + body + markup

    return page.children.add_new(TextBlock, title=title)
################################################################################ | |
def append_list(page, list_elem):
    """Append one block to `page` for each direct `li` child of `list_elem`.

    The block type is looked up in `block_map` by the list's tag name.
    Returns True after the items are added, or None (after printing a
    message) when the tag has no entry in `block_map`.
    """
    list_kind = list_elem.name
    block_type = block_map.get(list_kind, None)

    if block_type is None:
        print(f'-- Unknown list type - {list_kind}')
        return None

    items = list_elem.find_all('li', recursive=False)
    for item in items:
        page.children.add_new(block_type, title=get_block_text(item))

    # TODO return list of blocks
    return True
################################################################################ | |
def append_objects(page, objs):
    """Hand every entry of `objs` to `append_object`, in order.

    Always returns True; the results of the individual calls are discarded.
    """
    for entry in objs:
        append_object(page, entry)

    return True
################################################################################ | |
def append_object(page, elem):
    """Append the content of one `object` element to `page`.

    The first named child of `elem` decides what is imported; only `table`
    is supported and is delegated to `append_table`.  Anything else, or an
    element with no named child at all, is reported and None is returned.
    """
    # skip plain-string children (they have no name) so that whitespace in
    # front of the real content does not hide it, and tolerate an empty
    # element instead of letting next() raise StopIteration
    block = next(
        (child for child in elem.children if child.name is not None),
        None,
    )

    kind = None if block is None else block.name

    if kind == 'table':
        return append_table(page, block)

    print(f'-- Unsupported object: {kind}')
    return None
################################################################################ | |
# FIXME this is my least favorite part of the script...
def append_table(page, table):
    """Append an HTML table to `page` as a Notion collection view.

    The cells of the first row are used as the column names.  The collection
    and its table view are created when the second row is reached, and that
    row and every later row are added as collection rows.  A table with a
    single row therefore produces a collection view block with no collection.

    Returns the CollectionViewBlock that was added to `page`.
    """
    global client

    # XXX it would make more sense if Notion supported basic markdown tables
    # instead, we have to build a collection view to capture the table data
    block = page.children.add_new(CollectionViewBlock)

    # does Apple ever set a header? I don't think so...
    # XXX maybe we want a flag to use the first table row as a header or not?
    thead = None

    # rows are expected under <tbody>; when that wrapper is missing, look for
    # rows directly under the table instead of failing on None
    tbody = table.find('tbody')
    if tbody is None:
        tbody = table

    for tr in tbody.find_all('tr', recursive=False):
        # if no header was provided, we will build it from this row...
        if thead is None:
            thead = list()

        # if we have a header, but no Collection (yet)
        elif block.collection is None:
            schema = build_schema(thead)
            block.collection = client.get_collection(
                client.create_record("collection", parent=block, schema=schema)
            )

            # we need a new view to see our lovely table...
            block.views.add_new(view_type='table')

        # if we have a valid collection, add data directly to rows
        row = None if block.collection is None else block.collection.add_row()

        # start processing the column data...
        tds = tr.find_all('td', recursive=False)
        for idx, td in enumerate(tds):
            text = get_block_text(td)
            col_id = 'title' if idx == 0 else f'c{idx}'

            # no collection yet means this row is still the header row
            if block.collection is None:
                thead.append(text)

            if row is not None and text is not None:
                row.set_property(col_id, text)

    return block
################################################################################ | |
def append_imgs(page, imgs):
    """Hand every entry of `imgs` to `append_img`, in order.

    Always returns True; the results of the individual calls are discarded.
    """
    for picture in imgs:
        append_img(page, picture)

    return True
################################################################################ | |
# data URI for an image: captures the image type, the encoding and the payload
img_data_re = re.compile('^data:image/([^;]+);([^,]+),(.+)$')
img_http_re = re.compile('^https?://(.+)$')


def append_img(page, img_elem):
    """Upload an embedded image to Notion and append it to `page`.

    Only `src` values that are base64-encoded `data:image/...` URIs are
    supported.  The payload is decoded into a temporary file, an ImageBlock
    is added to `page`, and the file is uploaded to that block.

    Returns the ImageBlock (even when the upload itself failed, which is only
    reported), or None when the source is missing, is not an image data URI,
    or uses an encoding other than base64.
    """
    import base64
    import tempfile

    # Notes uses embedded images... we need to extract the image, upload it
    # and reference it in the block

    # TODO this probably needs more error handling and better flow

    # .get() so that an <img> without a src is reported rather than raising
    img_src = img_elem.get('src')

    m = None if img_src is None else img_data_re.match(img_src)

    if m is None:
        print('-- Unsupported img type:')
        return None

    img_type, img_data_enc, img_data_str = m.groups()

    if img_data_enc != 'base64':
        print(f'-- Unsupported img encoding: {img_data_enc}')
        return None

    img_data = base64.b64decode(img_data_str.encode('ascii'))

    block = None

    with tempfile.NamedTemporaryFile(suffix=f'.{img_type}') as fp:
        fp.write(img_data)

        # the upload reads the file by name, so push the buffered bytes out
        # first; otherwise it may see an empty or truncated file
        fp.flush()

        # upload the image to Notion
        block = page.children.add_new(ImageBlock)

        try:
            block.upload_file(fp.name)
        except Exception:
            # best effort: report the failure and keep going
            print('!! UPLOAD FAILED')

    return block
################################################################################ | |
def tell_notes(*args):
    """Run the given AppleScript lines against the Notes app.

    The arguments are joined with newlines into one script.  When the result
    code is not zero the error is printed and None is returned.  Otherwise
    the output is returned, with the literal outputs `null`, `false` and
    `true` mapped to None, False and True, and empty output mapped to None.
    """
    import applescript

    source = "\n".join(args)
    result = applescript.tell.app('Notes', source)

    if result.code != 0:
        print(f'!! ERROR - {result.err}')
        return None

    output = result.out

    # do some basic string to type mapping...
    literals = {'null': None, 'false': False, 'true': True}
    if output in literals:
        return literals[output]

    if len(output) == 0:
        return None

    return output
################################################################################ | |
def get_note(note_id):
    """Export one note from Apple Notes and return it as a dict.

    `note_id` is the trailing `pXXXX` part of the note's Core Data URL.  The
    AppleScript returns YAML metadata, a line containing only `---`, and then
    the note body.  The metadata is parsed with YAML and the stripped body is
    stored under the `body` key of the result.

    Returns None when the script produced no output or when the output does
    not contain the separator line.
    """
    # to get the data from Notes, we will get a dump from AppleScript
    # as YAML that we can turn back into a Python object

    text = tell_notes(
        # there is no direct way to get a note from AppleScript using the ID...
        # so we have to loop over all notes and look for the right one.
        'repeat with theNote in notes of default account',
        'set noteID to id of theNote as string',

        # the note ID is a full CoreData URL... we only want the pXXXX part
        f'if noteID ends with "/{note_id}" then',

        # determine the Notes folder
        # TODO get the full folder path
        'set folderName to ""',
        'set theContainer to container of theNote',
        'if theContainer is not missing value',
        'set folderName to (name of theContainer) & "/" & folderName',
        'end if',

        # "export" the note data when we find it...
        'set noteMeta to "meta:" ¬',
        ' & "\n id: " & quoted form of (id of theNote as string) ¬',
        ' & "\n name: " & quoted form of (name of theNote as string) ¬',
        ' & "\n folder: " & quoted form of folderName ¬',
        ' & "\n creation_date: \\"" & (creation date of theNote as date) & "\\"" ¬',
        ' & "\n modification_date: \\"" & (modification date of theNote as date) & "\\"" ¬',
        ' & "\n locked: " & (password protected of theNote as boolean) ¬',
        ' & "\n shared: " & (shared of theNote as boolean) ¬',
        ' & "\nattachments:"',

        # FIXME some attachments (like embedded documents) are causing problems...
        #'repeat with theAttachment in attachments of theNote',
        # 'set noteMeta to noteMeta & "\n - id: " & (id of theAttachment) ¬',
        # ' & "\n name: " & (name of theAttachment) ¬',
        # ' & "\n ref: " & (content identifier of theAttachment) ¬',
        # ' & "\n creation_date: " & (creation date of theAttachment as date) ¬',
        # ' & "\n modification_date: " & (modification date of theAttachment as date) ¬',
        # ' & "\n url: " & (url of theAttachment)',
        #'end repeat',

        'return noteMeta & "\n---\n" & (body of theNote as string)',
        'end if',
        'end repeat'
    )

    #print(text)
    if text is None:
        return None

    # parse the output from AppleScript into a Python object...
    # split on the first line that is exactly '---' rather than on the first
    # '---' anywhere, so a note name containing '---' cannot cut the metadata
    parts = re.split(r'^---$', text, maxsplit=1, flags=re.MULTILINE)
    if len(parts) != 2:
        print(f'!! ERROR - malformed export for note {note_id}')
        return None

    text_meta, text_body = parts

    # NOTE(review): Loader is the full (non-safe) YAML loader and the text
    # parsed here embeds note names; a safe loader looks like the better
    # choice - confirm before switching
    note = yaml.load(text_meta, Loader=Loader)
    note['body'] = text_body.strip()

    #print(yaml.dump(note))
    return note
################################################################################ | |
def get_log_filter(note_id, status):
    """Build the query filter that matches log entries for a note and status.

    The result combines two exact-match clauses with `and`: the `note_id`
    property must equal `note_id` and the `status` property must equal
    `status`.
    """
    def exact_clause(prop, operator, expected):
        # one filter clause requiring `prop` to match `expected` exactly
        return {
            'property': prop,
            'filter': {
                'operator': operator,
                'value': {
                    'type': 'exact',
                    'value': expected,
                },
            },
        }

    clauses = [
        exact_clause('note_id', 'string_is', note_id),
        exact_clause('status', 'enum_is', status),
    ]

    return {'filters': clauses, 'operator': 'and'}
################################################################################ | |
def get_log_entry(note_id, status):
    """Return the first import-log entry for `note_id` with the given status.

    Returns None when no import log is configured or when the query finds no
    matching entry.
    """
    global import_log

    if import_log is None:
        return None

    # honor the requested status; it used to be hard-coded to 'Finished',
    # which silently ignored the `status` argument
    filter_params = get_log_filter(note_id, status)
    result = import_log.query(filter=filter_params)

    return None if len(result) == 0 else result[0]
################################################################################ | |
## MAIN ENTRY | |
# since note data can get very large, we will extract one note at a time | |
# the 'notes' object serializes as a list of Core Data URL's... | |
notes_raw = tell_notes('return notes of default account')

# tell_notes returns None when the script fails or prints nothing; treat that
# as "no notes to import" instead of crashing inside re.split
note_links = [] if notes_raw is None else re.split(r', *', notes_raw)

# keep only the trailing pXXXX part of each Core Data URL
all_notes = [re.sub(r'^.*/(p[0-9]+)', r'\1', link) for link in note_links]

client = NotionClient(token_v2=notion_token_v2)
archive = client.get_block(import_page_url)

import_log = None

if import_log_url is not None:
    import_log_view = client.get_collection_view(import_log_url)
    import_log = import_log_view.collection

# load each note and upload to Notion
for note_id in all_notes:

    # look for an existing 'Finished' entry
    log = get_log_entry(note_id, 'Finished')

    if log is not None:
        print(f'{log.name} (Finished)')
        continue

    # prepare to import...
    note = get_note(note_id)
    if note is None:
        continue

    note_meta = note['meta']
    note_name = note_meta['name']

    # skip locked notes
    if note_meta['locked']:
        continue

    print(f'{note_name} [{note_id}]')

    # set up the log entry if needed
    log = None if import_log is None else import_log.add_row(
        name=note_name, status='Pending', note_id=note_id
    )

    try:
        # create a stubbed page for the import
        # TODO support the folder hierarchy from the note
        page = archive.children.add_new(PageBlock, title=note_name)
        if log is not None:
            log.page = page.get_browseable_url()

        html = note['body']
        notes_to_notion(html, page)

        # TODO upload attachments

        if include_meta or include_html:
            page.children.add_new(DividerBlock)

        if include_meta:
            meta_text = yaml.dump(note_meta)
            page.children.add_new(CodeBlock, title=meta_text, language='yaml')

        if include_html:
            page.children.add_new(CodeBlock, title=html, language='html')

    except Exception:
        # record the failure in the log (the schema has a 'Failed' status)
        # and then let the error propagate exactly as before
        if log is not None:
            log.status = 'Failed'
        raise

    # finally, mark the page as uploaded...
    if log is not None:
        log.status = 'Finished'
And thank you for doing it
Does this work?
This has been replaced by https://github.com/jheddings/notes2notion
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Does this work?