Last active
January 13, 2025 18:54
-
-
Save ttscoff/96828c8f9695849e3a84dbd46414676f to your computer and use it in GitHub Desktop.
Linkding to Markdown archiving script with webarchive generation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Archive linkding bookmarks to Markdown files | |
# Can use [Gather](https://brettterpstra.com/projects/gather-cli/) | |
# for conversion (if installed), or use Marky | |
# the Markdownifier (web-based). | |
# | |
# See options below for configuration | |
# | |
# This script is designed to run once initially, and then | |
# be set to run in the background at intervals using cron | |
# or launchd. | |
# | |
# The script can be run as root if needed, and will | |
# appropriately set permissions on generated files to make | |
# them accessible by your user (see config). | |
# | |
# You can specify a certain tag to only archive bookmarks | |
# containing that tag. This is useful because if you're | |
# just linking a tool or interesting app, you probably | |
# don't need a Markdown archive of it. Tagging allows | |
# selective and intentional archiving. But if you leave it | |
# blank, the script will just archive all bookmarks. | |
# | |
# Bookmarks will be pulled and markdownified using either | |
# [Gather](https://brettterpstra.com/projects/gather-cli/) | |
# (Mac only) or the web based [Marky the | |
# Markdownifier](https://markdownrules.com). This can be | |
# changed in the configuration. | |
# | |
# If you want to ensure future access to images, set | |
# `localize_images` to true and any accessible linked | |
# images in an article will be downloaded to your local | |
# machine. | |
# | |
# To use the script, you'll need Ruby available. Install | |
# with a package manager or version manager if you don't have | |
# it. | |
# | |
# Save the script in your PATH and make it | |
# executable with `chmod a+x linkding.rb`. Modify the | |
# options section at the top with your server name, API key | |
# (https://example.com/settings/integrations), and set the | |
# various options. Run it once from the command line to | |
# test and to archive initial existing bookmarks with the | |
# specified tag. Once run successfully, you can then set up | |
# a cron job or launchd agent to run in the background at | |
# intervals (I recommend at least a 5 minute interval). | |
# | |
# MIT license, copyright Brett Terpstra 2024 | |
# | |
# Standard-library dependencies used throughout the script
%w[time cgi fileutils json erb shellwords English].each do |filename|
  require filename
end

# Configuration
#
# Edit these values before the first run.
options = {
  # your linkding install
  server: 'https://example.com',
  # API key, see https://your_install/settings/integrations
  api_key: 'xxxxxxxxx',
  # leave empty to archive all bookmarks, also creates webarchive if configured
  note_tag: '.archive',
  # leave empty to create weblocs for all bookmarks
  bookmark_tag: '.webloc',
  # absolute path to folder to save notes
  notes_folder: '/Users/ttscoff/Dropbox/LinkDing/',
  # if true, download images to notes folder
  localize_images: false,
  # leave empty for no subfolder
  images_subfolder: 'images',
  # for setting permissions when run as root
  user: 'ttscoff:staff',
  # set to :gather or :marky
  tool: :marky,
  # path to the gather executable (used when tool is :gather)
  gather_path: '/usr/local/bin/gather',
  # if true, a webarchive is saved next to each Markdown note
  save_web_archive: true,
  # path to the executable that writes the webarchive
  web_archiver_path: '/usr/local/bin/save_safari_webarchive'
}
# Helpers for setting ownership and permissions on generated files
#
# Commands are executed without a shell (argument-list form of
# Kernel#system), so paths containing quotes, spaces, `$` or backticks are
# passed through literally. Failures are reported by the command itself on
# STDERR and never raise.
module Util
  class << self
    # Run chown on a file to set owner
    #
    # @param file [String] path to file
    # @param user [String, nil] user and optional group (user:group);
    #   nothing is done when nil
    #
    # @return [Boolean, nil] result of Kernel#system, nil when skipped
    #
    def chown(file, user = nil)
      return unless user

      system('chown', user.to_s, file.to_s)
    end

    # Run chmod on a file to set standard permissions
    # (755 for dirs, 644 for files)
    #
    # @param file [String] path to file
    #
    # @return [Boolean, nil] result of Kernel#system
    #
    def chmod(file)
      mode = File.directory?(file) ? '755' : '644'
      system('chmod', mode, file.to_s)
    end

    # Combo method for setting owner and permissions
    #
    # Does nothing (mode included) when no user is given.
    #
    # @param file [String] path to file
    # @param user [String] user and optional group to set on file
    #
    def permissions(file, user = nil)
      return unless user

      chown(file, user)
      chmod(file)
    end
  end
end
# String extensions used for formatting metadata, building file names and
# post-processing converted Markdown
class ::String
  # Indent every line of a string a given
  # number of spaces
  #
  # @param distance [Integer] number of spaces
  #
  # @return [String] indented copy with trailing whitespace removed
  #
  def indent(distance = 2)
    indent = ' ' * distance
    indent + gsub(/\n/, "\n#{indent}").rstrip
  end

  # Turn a title into a string usable as a file name: `/` becomes `:`,
  # `|` becomes `-`, `#` becomes `hash `, and every character other than
  # letters, digits, space and `:-_,.?!` is removed
  #
  # @return [String] sanitized copy
  #
  def sanitize
    gsub(%r{/}, ':').gsub(/\|/, '-').gsub(/#/, 'hash ').gsub(/[^:\-a-z0-9 _,.?!]/i, '')
  end

  #
  # Discard invalid characters and output a UTF-8 String
  #
  # NOTE(review): Ruby ships its own String#scrub / #scrub! (which also
  # accept a replacement argument); defining them here replaces the
  # built-ins for the whole process. Kept for compatibility with the rest
  # of the script.
  #
  # @return [String] UTF-8 encoded string
  #
  def scrub
    # Round-trip through UTF-16 so the conversion actually runs and
    # invalid byte sequences get replaced
    encode('utf-16', invalid: :replace).encode('utf-8')
  end

  #
  # Destructive version of #scrub
  #
  # @return [String] UTF-8 encoded string, in place
  #
  def scrub!
    replace scrub
  end

  #
  # Method to save linked images to local images
  #
  # Every linked media URL (png, jpg, gif, pdf, avif, webp, mov, ogg, mp4)
  # is downloaded with curl unless a file of the same name already exists,
  # and the link is rewritten to the local relative path. On any error the
  # failure is printed and the unmodified string is returned.
  #
  # @param options [Hash] hash of options
  #
  # @option options [String] :notes_folder path to notes folder
  # @option options [String] :images_subfolder subfolder to save to (blank or nil for root)
  # @option options [String] :user user and optional group to set for permissions
  #
  # @return [String] content with image links pointing at local files
  #
  def localize_images(options)
    folder = options[:notes_folder]
    # Normalize once so nil, empty and whitespace-only all mean "no subfolder"
    subfolder = options[:images_subfolder].to_s.strip
    user = options[:user]

    begin
      unless subfolder.empty?
        folder = File.join(folder, subfolder)
        FileUtils.mkdir_p(folder) unless File.directory?(folder)
      end

      gsub(%r{(?<=:\s|\()https?://[^ ]+/([^/ ]+\.(?:png|jpe?g|gif|pdf|avif|webp|mov|ogg|mp4))(?:\?.*?)?(?=\s|\)|"|$)}i) do
        m = Regexp.last_match
        img = m[0].strip
        image_name = m[1].strip
        target = File.join(folder, image_name)

        unless File.exist?(target)
          puts "๐ป Downloading image #{image_name}"
          # The URL comes from downloaded page content, so it must never
          # reach a shell: pass it to curl as a plain argument
          system('curl', '-SsL', '-o', target, img)
          Util.permissions(target, user)
        end

        subfolder.empty? ? image_name : "#{subfolder}/#{image_name}"
      end
    rescue StandardError => e
      puts 'Failed to localize images'
      puts e
      puts e.backtrace
      self
    end
  end

  # Destructive version of #localize_images
  def localize_images!(options)
    replace localize_images(options)
  end

  # Some fixes for content created by Gather
  # Removes empty lines, fixes weird self-links
  #
  # @return [String] cleaned-up copy
  #
  def gather_fix
    empties = []
    out = strip.scrub
    # Collapse runs of blank lines
    out.gsub!(/^(\s*\n){2,}/, "\n\n")
    # Unwrap `<url>][n]` self-links and remember the reference numbers
    out.gsub!(/(?<!\[)(<.*?>)\](\[\d+\])/) do
      m = Regexp.last_match
      empties << m[2]
      m[1]
    end
    # Drop the reference definitions that are no longer used
    empties.each do |e|
      out.gsub!(/^\s*#{Regexp.escape(e)}: .*?$/, '')
    end
    out
  end

  # destructive version of #gather_fix
  def gather_fix!
    replace gather_fix
  end

  # Some fixes for Marky output
  # Removes metadata to be replaced by this script's metadata
  #
  # @return [String] cleaned-up copy
  #
  def marky_fix
    content = strip.scrub
    content.gsub!(/^(date|title|tags|source|description):.*?\n/, '')
    content.strip
  end

  # destructive version of #marky_fix
  def marky_fix!
    replace marky_fix
  end
end
# Client for a Linkding server plus the converters used to archive
# bookmarks (Markdown via Marky or Gather, webloc files, webarchives)
#
# External commands are started with argument lists rather than shell
# strings, because bookmark and redirect URLs are untrusted input.
class Linkding
  # Number of bookmarks requested per API call
  PAGE_SIZE = 1000

  # Longest chain of meta-refresh redirects that will be followed
  MAX_REDIRECTS = 5

  # Initialize a new instance
  #
  # @param options [Hash] hash of options
  #
  # @option options [String] :server Linkding server
  # @option options [String] :api_key API key
  # @option options [String] :note_tag Tag to archive
  # @option options [String] :notes_folder Folder to save to
  # @option options [Boolean] :localize_images Download images
  # @option options [String] :images_subfolder Subfolder to download images to
  # @option options [String] :user user[:group] to apply permissions
  # @option options [Symbol] :tool Tool to use for markdownifying (:gather or :marky)
  # @option options [String] :gather_path Path to the gather executable
  # @option options [String] :web_archiver_path Path to the webarchive tool
  #
  def initialize(options = {})
    # Always assign, so an empty hash does not leave @options nil
    @options = options
  end

  # retrieves the JSON output from the Linkding API
  #
  # @param api_call [String] the API path to call
  #
  # @return [Hash] Converted hash of output
  def get_json(api_call)
    JSON.parse(curl('-H', "Authorization: Token #{@options[:api_key]}", "#{@options[:server]}#{api_call}"))
  end

  # compares bookmark array to existing bookmarks to find new urls
  #
  # A bookmark counts as existing when the notes folder already holds a
  # file named after its sanitized title with the given extension.
  #
  # @param tag [String, nil] only fetch bookmarks with this tag (all if blank)
  # @param extension [String] extension of the files to compare against
  #
  # @return [Array] array of unsaved bookmarks
  #
  def new_bookmarks(tag, extension = 'md')
    search = tag && !tag.empty? ? "&q=%23#{CGI.escape(tag)}" : ''
    call = "/api/bookmarks/?limit=#{PAGE_SIZE}&format=json#{search}"

    json = get_json(call)
    bookmarks = results_from(json)
    offset = 0
    while json['next']
      # offset is an item index, so advance by a full page each time
      offset += PAGE_SIZE
      json = get_json("#{call}&offset=#{offset}")
      bookmarks.concat(results_from(json))
    end

    existing_files = Dir.glob("*.#{extension}", base: @options[:notes_folder])
    unless existing_files.empty?
      bookmarks.reject! do |s|
        existing_files.include? "#{s['title'].sanitize}.#{extension}"
      end
    end
    bookmarks
  end

  # Test if URL result is meta redirect
  #
  # @param url [String] URL to check
  # @param depth [Integer] number of redirects followed so far; recursion
  #   stops at MAX_REDIRECTS so a page refreshing to itself cannot loop
  #
  # @return [String] final url after following redirects
  #
  def redirect?(url, depth = 0)
    return url if depth >= MAX_REDIRECTS

    content = curl(url).scrub
    return url unless content =~ /meta http-equiv=["']refresh["'].*?url=(.*?)["']/

    target = Regexp.last_match(1)
    # An empty target or a page that refreshes to itself is not a redirect
    return url if target.empty? || target == url

    redirect?(target, depth + 1)
  end

  # markdownify url with Marky the Markdownifier
  #
  # @param url [String] URL to markdownify
  #
  # @return [String] markdown content
  #
  def marky(url)
    url = redirect?(url)
    call = %(https://heckyesmarkdown.com/api/2/?url=#{CGI.escape(url)}&readability=1)
    curl(call).marky_fix
  end

  # markdownify url with Gather
  #
  # @param url [String] url to markdownify
  #
  # @return [String] markdown content
  #
  def gather(url)
    url = redirect?(url)
    IO.popen([@options[:gather_path], url], &:read).gather_fix
  end

  # Write a macOS .webloc (XML property list) pointing at a URL
  #
  # @param url [String] URL to store
  # @param title [String] base name of the file (without extension)
  # @param path [String] folder to write to
  #
  # @return [String, false] path of the new file, false if it already exists
  #
  def to_webloc(url, title, path)
    filename = File.join(File.expand_path(path), "#{title}.webloc")
    return false if File.exist?(filename)

    File.open(filename, 'w') do |f|
      # The URL must be XML-escaped: a bare `&` in a query string would
      # make the property list invalid
      f.puts <<~EOXML
        <?xml version="1.0" encoding="UTF-8"?>
        <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
        <plist version="1.0">
        <dict>
        <key>URL</key>
        <string>#{CGI.escapeHTML(url)}</string>
        </dict>
        </plist>
      EOXML
    end
    filename
  end

  # Save a webarchive of a URL using the configured archiver tool
  #
  # @param url [String] URL to archive
  # @param path [String] target path of the webarchive
  #
  # @return [Boolean] true if the archiver exited successfully
  #
  def to_webarchive(url, path)
    @options[:web_archiver_path] ||= '/usr/local/bin/save_safari_webarchive'
    url = redirect?(url)
    # Output of the tool is discarded; system returns nil when the tool
    # cannot be started, which is reported as failure
    system(@options[:web_archiver_path], url, path, out: File::NULL, err: File::NULL) ? true : false
  end

  private

  # Run curl (silent, following HTTP redirects) and return its output
  #
  # @param args [Array<String>] extra arguments, passed without a shell
  #
  # @return [String] response body
  #
  def curl(*args)
    IO.popen(['curl', '-SsL', *args], &:read)
  end

  # Extract the result list from an API response
  #
  # @param json [Hash] parsed API response
  #
  # @return [Array] bookmarks in the response
  # @raise [RuntimeError] if the response has no "results" key
  #
  def results_from(json)
    json.fetch('results') do
      raise "Unexpected response from Linkding API: #{json.inspect}"
    end
  end
end
# Apply macOS Finder tags to a file by writing the
# com.apple.metadata:_kMDItemUserTags extended attribute with xattr
#
# xattr is run without a shell, so tags and paths containing quotes or
# other shell metacharacters are passed through literally; tag text is
# XML-escaped before being placed in the property list.
#
# @param file [String] path to the file to tag
# @param tag_array [Array<String>, nil] tags to apply
#
# @return [Boolean, nil] false if there is nothing to tag or the file does
#   not exist, otherwise the result of Kernel#system for the xattr call
#
def apply_tags(file, tag_array)
  return false unless tag_array && !tag_array.empty?
  return false unless File.exist?(file)

  entries = tag_array.map { |tag| "<string>#{CGI.escapeHTML(tag.to_s)}</string>" }.join
  plist = '<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"><plist version="1.0"><array>'
  plist += entries
  plist += '</array></plist>'
  warn("๐ท๏ธ Tagging #{file} (#{tag_array.join(' ')})")
  system('xattr', '-w', 'com.apple.metadata:_kMDItemUserTags', plist, file.to_s)
end
## Should require absolute paths as running as root will expand to wrong path
# options[:notes_folder] = File.expand_path(options[:notes_folder])
ld = Linkding.new(options)
puts "#{Time.now.strftime('%Y-%m-%d %H:%M')}: Starting"

## Store Markdown
# Fetch the bookmarks carrying the note tag that have no Markdown file in
# the notes folder yet
new_bookmarks = ld.new_bookmarks(options[:note_tag])
puts "#{Time.now.strftime('%Y-%m-%d %H:%M')}: Fetched archive bookmarks"
# Retrieve content with specified tool
#
# Any :tool value whose name starts with "m" selects Marky; everything
# else (including no value) selects Gather.
#
# @param options [Hash] script options, :tool is read
# @param bookmark [Hash] bookmark whose 'url' is converted
# @param ld [Linkding] object providing #marky and #gather
#
# @return [String] Markdown content returned by the selected converter
#
def get_content(options, bookmark, ld)
  use_marky = options[:tool].to_s.match?(/^m/)
  puts "๐ท๏ธ #{use_marky ? 'Markdownifying' : 'Gathering'} #{bookmark['url']}"
  use_marky ? ld.marky(bookmark['url']) : ld.gather(bookmark['url'])
end
# Archive each new bookmark: convert it to Markdown, write the note with a
# YAML header, tag the file, and optionally save a webarchive next to it.
# A failure on one bookmark is reported and the loop moves on to the next.
new_bookmarks.each do |bookmark|
  content = get_content(options, bookmark, ld)
  if content.strip.empty?
    puts '๐ฅ Failed to gather'
    next
  end
  puts 'โ Gathered'
  content.localize_images!(options) if options[:localize_images]

  title = bookmark['title'].strip
  # JSON string syntax is valid for a YAML double-quoted scalar, so quotes
  # and backslashes in the title cannot break the header
  yaml_title = title.to_json
  url = bookmark['url'].strip

  description_text = bookmark['description'].to_s
  description = description_text.strip.empty? ? '' : "\ndescription: >\n#{description_text.indent}"
  notes_text = bookmark['notes'].to_s
  notes = notes_text.strip.empty? ? '' : "\nnotes: >\n#{notes_text.indent}"

  # The tag that triggered archiving is left out of the note's tags.
  # reject (not delete) keeps the bookmark hash itself untouched.
  note_tag = options[:note_tag].to_s
  tag_list = (bookmark['tag_names'] || []).reject { |t| !note_tag.empty? && t == note_tag }
  tags = tag_list.join(', ')
  added = Time.parse(bookmark['date_added']).strftime('%Y-%m-%d %H:%M')

  # Template for markdown output, YAML headers and content
  # description and notes added as header keys
  template = ERB.new <<~ENDTEMPLATE
    ---
    title: <%= yaml_title %>
    source: <%= url %>
    date: <%= added %><%= description %><%= notes %>
    tags: [<%= tags %>]
    ---
    <%= content %>
  ENDTEMPLATE
  out = template.result(binding)

  filename = File.join(options[:notes_folder], "#{bookmark['title'].sanitize}.md")
  puts "๐พ Writing content to #{filename}"
  File.open(filename, 'w') do |f|
    f.puts out
  end

  # Set permissions on generated file, in case we're running as root
  Util.permissions(filename, options[:user])

  # Finder tags come from the API's `tag_names` field; they are passed
  # as-is because shell-escaping would leave literal backslashes in them
  apply_tags(filename, tag_list) unless tag_list.empty?

  if options[:save_web_archive]
    archive_filename = File.join(options[:notes_folder], "#{bookmark['title'].sanitize}.webarchive")
    FileUtils.rm(archive_filename) if File.exist?(archive_filename)
    if ld.to_webarchive(url, archive_filename)
      puts "๐ Web archive saved to #{archive_filename}"
      Util.permissions(archive_filename, options[:user])
      apply_tags(archive_filename, tag_list) unless tag_list.empty?
    else
      puts 'โ ๏ธ Failed to write webarchive'
    end
  end
rescue StandardError => e
  puts "๐ฅ Failed to gather #{bookmark['url']}"
  puts e
  puts e.backtrace
end
puts "๐๏ธ Archived #{new_bookmarks.count} new bookmarks"
## Store Weblocs
# Fetch the bookmarks carrying the bookmark tag that have no .webloc file
# in the notes folder yet
new_bookmarks = ld.new_bookmarks(options[:bookmark_tag], 'webloc')
puts "#{Time.now.strftime('%Y-%m-%d %H:%M')}: Fetched webloc bookmarks"

# Write a .webloc for each new bookmark, then fix permissions and tag it.
# A failure on one bookmark is reported and the loop moves on to the next.
new_bookmarks.each do |bookmark|
  title = bookmark['title'].strip.sanitize
  url = bookmark['url'].strip
  filename = ld.to_webloc(url, title, options[:notes_folder])
  # to_webloc returns false when the file already exists
  next unless filename

  puts "๐พ Writing bookmark to #{filename}"
  Util.permissions(filename, options[:user])
  # Tags are passed as-is: shell-escaping them would leave literal
  # backslashes in the stored tag names
  apply_tags(filename, bookmark['tag_names'])
rescue StandardError => e
  puts "๐ฅ Failed to create webloc for #{bookmark['url']}"
  puts e.message
  puts e.backtrace
end
puts "๐ Bookmarked #{new_bookmarks.count} new bookmarks"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment