Created
February 6, 2020 12:44
-
-
Save sirupsen/03fb05cc7efa6a63c8d5c9737fb0686f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'set'
require 'time'

require 'byebug'
require "sqlite3"
# Open (or create) the on-disk full-text index. It can be rebuilt at any time,
# and is otherwise kept up to date incrementally.
db = SQLite3::Database.new "index.db"

# Keep prefix indexes for "mos*" searches.
#
# TODO: It doesn't seem like SQLite FTS5 supports synonyms well. That's ok, but
# we're going to want that. We can download this database from Princeton, write
# a parser for it (or use grind(1)). This should allow us to potentially do
# `OR` queries. Alternatively, and probably better, would be to see if Lucene
# supports this.
#
# https://wordnet.princeton.edu/download/current-version
db.execute <<-SQL
  CREATE VIRTUAL TABLE IF NOT EXISTS zettelkasten
  USING fts5(title, body, tags, mtime UNINDEXED, prefix = 3, tokenize = "porter unicode61");
SQL

# Weigh tags higher, and title a bit higher. The bm25 weights are given in
# column order: title = 2.0, body = 1.0, tags = 5.0, mtime = 0.0.
db.execute <<-SQL
  INSERT INTO zettelkasten (zettelkasten, rank) VALUES('rank', 'bm25(2.0, 1.0, 5.0, 0.0)');
SQL
# Incrementally sync the index with the Markdown files in the current
# directory: insert new notes, refresh modified ones, and drop entries whose
# file no longer exists.
#
# Maps each indexed title (the file path) to its stored mtime in epoch seconds.
indexed_mtimes = db.execute("SELECT title, mtime FROM zettelkasten")
                   .map { |title, mtime| [title, Time.parse(mtime).to_i] }
                   .to_h

note_paths = Dir["*.md"]

note_paths.each do |path|
  mtime = File.stat(path).mtime
  indexed_at = indexed_mtimes[path]

  # Skip notes that are already indexed and have not been modified since.
  # Compare whole seconds (to_i) because the stat may have more precision than
  # the timestamp string stored in the index.
  next if indexed_at && mtime.to_i <= indexed_at

  contents = File.read(path)
  # Tags are every "#word" token found anywhere in the note.
  tags = contents.scan(/#[\w-]+/).join(" ")

  if indexed_at
    db.execute(<<-SQL, [contents, tags, mtime.to_s, path])
      UPDATE zettelkasten SET body = ?, tags = ?, mtime = ? WHERE title = ?
    SQL
  else
    db.execute(<<-SQL, [path, contents, tags, mtime.to_s])
      INSERT INTO zettelkasten (title, body, tags, mtime) VALUES (?, ?, ?, ?);
    SQL
  end
end

# Delete any entries in the full-text index that don't have files! Nothing is
# printed here so that stdout carries search results only.
(indexed_mtimes.keys - note_paths).each do |path|
  db.execute("DELETE FROM zettelkasten WHERE title = ?;", [path])
end
# Command-line modes:
#   -f TITLE [QUERY]  preview: print the body of one note, with QUERY matches
#                     highlighted when a non-empty QUERY is given
#   QUERY...          search: print the titles of matching notes, best first
#   (no arguments)    list: print every indexed title
file_cat = ARGV.delete("-f")

results =
  if file_cat
    # For preview
    title, query = ARGV

    if query.nil? || query.empty?
      # This is when it starts and there's no query input...
      db.execute(<<-SQL, [title])
        SELECT rank, body FROM zettelkasten WHERE title = ?;
      SQL
    else
      # Column 1 is the body; matches are wrapped in ANSI red-background codes.
      db.execute(<<-SQL, [title, query])
        SELECT rank, highlight(zettelkasten, 1, '\x1b[0;41m', '\x1b[0m')
        FROM zettelkasten WHERE title = ? AND zettelkasten MATCH ? ORDER BY rank;
      SQL
    end
  elsif ARGV[0]
    # Ideally we'd use the search to also `cat` instead of using `bat`, in order
    # to provide highlighting within the document.
    #
    # Column 0 is the title, so this prints highlighted titles.
    db.execute(<<-SQL, [ARGV.join(" ")])
      SELECT rank, highlight(zettelkasten, 0, '\x1b[0;41m', '\x1b[0m')
      FROM zettelkasten WHERE zettelkasten MATCH ? ORDER BY rank;
    SQL
  else
    db.execute("SELECT title FROM zettelkasten;")
  end

# The text to print is always the last column of a row: the second column for
# the (rank, text) queries, and the only column for the plain title listing.
results.each do |row|
  puts row.last
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment