Skip to content

Instantly share code, notes, and snippets.

@rcy
Created January 31, 2022 21:42
Show Gist options
  • Save rcy/18f2aa8b867cd26e247da9e26addd43f to your computer and use it in GitHub Desktop.
Save rcy/18f2aa8b867cd26e247da9e26addd43f to your computer and use it in GitHub Desktop.
;;; annotate.el --- mark up foreign language buffers according to anki deck
;; Copyright (C) 2009 Ryan Yeske
;; Author: Ryan Yeske <[email protected]>
;; Keywords: languages
;; Version: 2009-03-17 23:04:03
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;; The main assumption is that your cards have target language
;; sentences or words on the front, and anything on the back, which is
;; ignored.
;; Useful global bindings:
;; (global-set-key (kbd "C-c a") 'annotate-make-flashcard)
;; (global-set-key (kbd "C-c l") 'annotate-look-for-word)
;; (global-set-key (kbd "C-c k") 'annotate-save-text-as-known)
;;; TODO:
;; bugs:
;; - figure out face lossage when switching background colors (light to dark &vv)
;; dictionary:
;; - implement some kinda automated dictionary lookup
;; *report*:
;; - occur on RET on buffer
;; - dump sentences from words on keystroke
;; - count total buffer words
;; - keystroke to add words to ignore file (and implement ignore file)
;; *sentences*:
;; - end of buffer error
;; - hit RET to jump to the sentence in the buffer
;; - hit (keystroke?) to create a card from sentence
;;; Code:
(require 'cl)
(require 'thingatpt)
(defvar annotate-anki-file "~/spanish/srs/initial-spanish.anki")
(defvar annotate-import-file "~/import.txt")
(defvar annotate-looking-for-file "~/looking-for.txt"
"Words in this file will trigger notification when they are
seen in annotated buffers.")
(defvar annotate-unknown-coverage .02
"How much of buffer to mark up with unknown words.")
(defface annotate-mature
'((t (:inherit default)))
"The face used to highlight mature words.")
(defface annotate-young
'((((background light)) (:background "#ccffcc"))
(((background dark)) (:background "#007700")))
"The face used to highlight young words.")
(defface annotate-new
'((((background light)) (:background "#e50000"))
(((background dark)) (:background "#770000")))
"The face used to highlight new words.")
(defface annotate-unknown
'((((background light)) (:background "#ffffaa"))
(((background dark)) (:background "#555500")))
"The face used to highlight unknown words.")
(defface annotate-ignore
'((t (:bold t)))
"The face used to highlight ignored words.")
(defvar annotate-harvest-min-words 4)
(defvar annotate-harvest-max-words 12)
(defun annotate-make-table ()
(make-hash-table :test 'equal))
(defvar annotate-mature-table (annotate-make-table)
"Hash table of known mature words.")
(defvar annotate-known-table (annotate-make-table)
"Hash table of known words.")
(defvar annotate-young-table (annotate-make-table)
"Hash table of known young words.")
(defun annotate-buffer (percentage &optional buffer)
(interactive "p")
(with-current-buffer (or buffer (current-buffer))
(annotate-region (point-min) (point-max)
(sort (annotate-hash-table-to-alist
(annotate-word-frequency annotate-known-table
annotate-known-list))
;; sort by frequency
(lambda (a b) (cond ((= (cdr a) (cdr b))
(zerop (random 2)))
(t (> (cdr a) (cdr b))))))
(and percentage (/ percentage 100.0)))))
(defvar annotate-known-word-file "/home/rcy/known.txt")
(defun annotate-update-tables ()
(interactive)
(annotate-word-table annotate-known-table
annotate-anki-file "interval > 0")
(annotate-word-table annotate-mature-table
annotate-anki-file "interval > 21")
(annotate-word-table annotate-young-table
annotate-anki-file "interval > 0 and interval <= 21"
annotate-mature-table)
(setq annotate-known-list (annotate-word-list-from-file
annotate-known-word-file))
(message "%d %d %d"
(hash-table-count annotate-mature-table)
(hash-table-count annotate-young-table)
(hash-table-count annotate-known-table)))
(defun annotate-save-text-as-known (text)
(interactive (list
(read-from-minibuffer "Known phrase: "
(annotate-cleanup-sentence
(buffer-substring (region-beginning)
(region-end))))))
(with-temp-buffer
(insert text)
(newline)
(append-to-file (point-min) (point-max) annotate-known-word-file)))
(defun annotate-word-list-from-file (file)
(let (word-list)
(with-temp-buffer
(insert-file-contents file)
(annotate-wordify-buffer)
(while (annotate-search-forward-word)
(push (match-string 0) word-list)))
word-list))
(defun annotate-count-anki-sentences ()
"Return the number of facts (unique sentences) in anki deck."
(interactive)
(with-temp-buffer
(call-process "sqlite3" nil (current-buffer) nil
(expand-file-name annotate-anki-file)
(concat "select id from facts"))
(count-lines (point-min) (point-max))))
(defun annotate-stats ()
"Report some stats for graphing"
(interactive)
(let ((mature-tab (annotate-make-table))
(young-tab (annotate-make-table)))
(annotate-word-table mature-tab annotate-anki-file
"interval > 21")
(annotate-word-table young-tab annotate-anki-file
"interval > 0 and interval <= 21" mature-tab)
(let ((mature (hash-table-count mature-tab))
(young (hash-table-count young-tab)))
(message (concat (format-time-string "%Y%m%d")
(format " %d %d %d"
(annotate-count-anki-sentences)
mature
(+ young mature)))))))
(defun annotate-display-known-words ()
(interactive)
(annotate-dump-table annotate-known-table))
(defun annotate-table-words (table)
(let (words)
(maphash (lambda (word freq) (push word words)) table)
words))
(defun annotate-add-word (word table)
"Add WORD to TABLE or increment count if it exists."
(let ((val (gethash word table)))
(puthash word (if val (1+ val) 1) table)))
(defun annotate-markup (beg end face)
"Mark region between BEG and END with FACE."
(add-text-properties beg end `(face ,face rear-nonsticky t)))
(defun annotate-trim-to-coverage (alist coverage total)
"Return a subset of ALIST such that we get COVERAGE of TOTAL."
(let ((cell (car alist))
(current 0.0)
acc)
(while (not (or (>= (/ current total) coverage)
(null cell)))
(let ((count (cdr cell)))
(push cell acc)
(incf current count)
(setq alist (cdr alist))
(setq cell (car alist))))
acc))
(defun annotate-set-header-line (alist)
(setq header-line-format (mapconcat 'car alist " ")))
(defvar annotate-document-table nil)
(defvar annotate-common-alist nil
"Alist of common unknown word and frequency pairs.")
(defun annotate-region (start end word-frequency-alist &optional coverage)
(interactive "r")
;; prepare buffer
(let ((inhibit-read-only t)
(modified-p (buffer-modified-p)))
(remove-text-properties start end '(face))
;; take the top words from word-frequency-alist
(set (make-local-variable 'annotate-common-alist)
(reverse
(annotate-trim-to-coverage word-frequency-alist
(or coverage annotate-unknown-coverage)
(annotate-count-words-region
(point-min) (point-max)))))
(annotate-set-header-line annotate-common-alist)
(save-excursion
(goto-char start)
(let ((all (annotate-make-table))
(total-words 0))
(while (annotate-search-forward-word end)
(incf total-words)
(let ((word (downcase (match-string 0)))
(mbeg (match-beginning 0))
(mend (match-end 0)))
(annotate-add-word word all)
;; look up word for markup
(cond
;; its a common unknown word
((assoc word annotate-common-alist)
(annotate-markup mbeg mend 'annotate-new))
;; its a word recently being studied
((gethash word annotate-young-table)
(annotate-markup mbeg mend 'annotate-young))
;; do nothing
(t))))
(annotate-convert-table all total-words)
(set (make-local-variable 'annotate-document-table)
all)))
(set-buffer-modified-p modified-p)))
(defun annotate-phrase-regexp (phrase)
"Return a regexp to match PHRASE with any case, whitespace or punctuation."
(mapconcat (lambda (word)
(concat "\\(\\<" word "\\>\\)"))
(annotate-wordify-sentence phrase)
"[^[:alpha:]]+"))
(defun annotate-count-words-region (beg end)
"Return the number of words between BEG and END."
(save-excursion
(goto-char beg)
(let ((count 0))
(while (annotate-search-forward-word end)
(incf count))
count)))
(defun annotate-setup-flashcard-line ()
(interactive)
(let* ((marker (get-text-property (point) 'occur-target))
(file (buffer-file-name (marker-buffer marker)))
(posn (marker-position marker)))
(annotate-setup-flashcard-region (line-beginning-position)
(line-end-position)
file posn)))
(defun annotate-setup-flashcard-region ()
(interactive)
(save-excursion
(annotate-make-flashcard)))
(define-derived-mode annotate-sentence-mode text-mode "Sentence"
"Major mode for viewing harvested sentences.
\\{annotate-sentence-mode-map}"
)
;;; hack to create a list of common unknown words, from a word
;;; frequency list and from the list of words in anki
(defun annotate-display-unknown-words ()
"Display a buffer containing words in current buffer that are unknown."
(interactive)
(let ((words (annotate-find-unknown-words
(buffer-substring (point-min) (point-max))
annotate-known-table)))
(pop-to-buffer "*unknown*")
(erase-buffer)
(dolist (word words)
(insert word)
(newline))))
(defun annotate-find-hunted (start end)
"Mark hunted phrases in current buffer between START and END."
(interactive "r")
(save-excursion
(goto-char start)
(let ((re (with-temp-buffer
(insert-file-contents-literally annotate-looking-for-file)
(mapconcat (lambda (phrase)
(annotate-phrase-regexp phrase))
(split-string (buffer-substring (point-min) (point-max))
"\n" t)
"\\|")))
(start (point)))
(while (re-search-forward re end t)
(annotate-markup (match-beginning 0) (match-end 0)
'font-lock-warning-face)))))
(defun annotate-phrase-regexp (phrase)
"Return a regexp to match PHRASE with any case, whitespace or punctuation."
(mapconcat (lambda (word)
(concat "\\(\\<" word "\\>\\)"))
(annotate-wordify-sentence phrase)
"[^[:alpha:]]+"))
(defun annotate-search-forward-word (&optional bound)
"Search forward one word. Moves point to end of search.
Optional argument bounds the search; it is a buffer position.
Returns nil if there is no word to move forward over.
See also the functions `match-beginning', `match-end', `match-string',
and `replace-match'."
;; this is the point at which we could conditionalize the regexp or
;; search logic for chinese characters or other languages that arent
;; as trivial as moving across alpha characters
(re-search-forward "[[:alpha:]]+" bound t))
(defun annotate-cleanup-sentence (string)
"Remove properties and extra whitespace from STRING."
(when string
(mapconcat 'substring-no-properties (split-string string) " ")))
(defun annotate-sentence-words (sentence)
"Return a list of downcased words in sentence.
Non words are removed, as are duplicate words.
Word order not preserved."
;;; FIXME: consider reusing wordify-buffer for this (also think
;;; about duplicate word issue)
(when sentence
(with-temp-buffer
(insert sentence)
(goto-char (point-min))
(let ((start (point))
wordlist)
(while (re-search-forward "[[:alpha:]]+" nil t)
(let ((word (downcase (match-string 0))))
(push word wordlist)))
wordlist))))
(defvar annotate-word-sentence-table nil) ;FIXME: make buffer
;local, for multiple
;concurrent sentence
;mining processes
(defun annotate-build-sentence-table (&optional buffer)
(with-current-buffer (or buffer (current-buffer))
(setq annotate-word-sentence-table (annotate-make-table))
(goto-char (point-min))
(while (not (>= (point) (point-max)))
(forward-sentence)
(sit-for 0)
(let* ((sentence (annotate-cleanup-sentence (sentence-at-point)))
(words (annotate-wordify-sentence sentence))
(slen (length words)))
(if (and (>= slen annotate-harvest-min-words)
(<= slen annotate-harvest-max-words))
(dolist (word words)
(let ((s (propertize sentence
'occur-target
(set-marker (make-marker) (point))))
(ss (gethash word annotate-word-sentence-table)))
(puthash word (if ss
(cons s ss)
(list s))
annotate-word-sentence-table))))))))
(defun annotate-mine-sentences (&optional buffer)
(interactive)
(annotate-build-sentence-table buffer)
(let ((freq-table))
(maphash (lambda (w ss)
(setq freq-table (cons (cons w (length ss)) freq-table)))
annotate-word-sentence-table)
(setq freq-table
(sort freq-table
(lambda (a b)
(let ((ka (gethash (car a) annotate-known-table))
(kb (gethash (car b) annotate-known-table)))
(if (or (and ka kb)
(and (not ka) (not kb)))
(> (cdr a) (cdr b))
kb)))))
(with-current-buffer (get-buffer-create "*sentences*")
(erase-buffer)
(annotate-sentence-mode)
(dolist (cell freq-table)
(insert (format "\n* %s %d\n\n" (car cell) (cdr cell)))
(message (car cell)) (sit-for 0)
(let ((opoint (point)))
(dolist (sentence (gethash (car cell) annotate-word-sentence-table))
(insert (format " %.3f. %s\n"
(annotate-score-words (annotate-wordify-sentence
sentence)
annotate-known-table)
sentence)))
(sort-lines t opoint (point))))
(pop-to-buffer (current-buffer))
(goto-char (point-min)))))
(defun annotate-harvest-sentences-buffer ()
"Harvest sentences from buffer.
The region must already be annotated with `annotate-region'."
(interactive)
(annotate-harvest-sentences-region (point-min) (point-max)))
(defun annotate-phrase-at-point ()
"Return string including word at point and 4 words on either side."
(save-excursion
(backward-word 5)
(let ((start (point)))
(forward-word 9)
(annotate-cleanup-sentence (buffer-substring start (point))))))
(defun annotate-mine-phrases ()
(interactive)
(let ((table (annotate-mine-phrases-1 (current-buffer))))
(annotate-display-phrases table)))
(defun annotate-mine-phrases-1 (buffer &optional table)
(save-excursion
(with-current-buffer buffer
(goto-char (point-min))
(while (re-search-forward "\\<[[:alpha:]]+\\>" nil t)
(let ((word (downcase (word-at-point))))
(if (not (annotate-known-p word))
;; find 4 words before and 4 after for a phrase
(let ((phrase (annotate-phrase-at-point)))
;; if there is more than 1 unknown word, its a candidate phrase
(unless (cdr (annotate-find-unknown-words phrase annotate-known-table))
(let ((string (propertize (concat "..." phrase "...")
'occur-target
(set-marker (make-marker) (point)))))
(push (cons word string) table)))))))
table)))
(defvar annotate-reading-directory "~/spanish/reading")
(defun annotate-mine-phrases-files ()
"Mine phrases from all files in `annotate-reading-directory'."
(interactive)
(let ((files (directory-files annotate-reading-directory
t "^[^.].*\\.txt$" t))
table)
(dolist (file files)
(find-file file)
(setq table (annotate-mine-phrases-1 (current-buffer) table)))
(annotate-display-phrases table)))
(defun annotate-display-phrases (table)
(let ((buf (current-buffer)))
(with-current-buffer (get-buffer-create "*phrases*")
(erase-buffer)
(save-restriction
(dolist (cell (reverse table))
(insert (cdr cell))
(newline))
(pop-to-buffer (current-buffer))
(goto-char (point-min))
(annotate-buffer)
(annotate-sentence-mode)))))
(defun annotate-score-words (words knowntab)
"Return the average frequencies of WORDS in KNOWNTAB."
(let ((total 0.0))
(dolist (word words)
(incf total (or (gethash word knowntab) 0)))
(/ total (length words))))
(defun annotate-sentence-mode-goto-sentence ()
(interactive)
(save-excursion
(end-of-line)
(backward-char)
(occur-mode-goto-occurrence)))
(defvar annotate-sentence-mode-map
(let ((map (make-sparse-keymap)))
(define-key map (kbd "RET") 'annotate-sentence-mode-goto-sentence)
(define-key map "a" 'annotate-setup-flashcard-line)
map)
"Keymap for `annotate-sentence-mode'.")
(defvar annotate-known-list nil)
(defun annotate-known-p (word)
(or (gethash word annotate-known-table) ;the anki db
(member word annotate-known-list))) ;the known.txt file
(defun annotate-find-unknown-words (sentence known-table)
"Return a list of words from SENTENCE that are not in KNOWN-TABLE."
(let (unknowns)
(dolist (word (annotate-wordify-sentence sentence))
(unless (annotate-known-p word)
(push word unknowns)))
unknowns))
(define-derived-mode annotate-sentence-mode text-mode "Sentence"
"Major mode for viewing harvested sentences.
\\{annotate-sentence-mode-map}"
)
(defun annotate-wordify-sentence (string)
"Return a list of words, in order, downcased, from string."
(when string
(with-temp-buffer
(insert string)
(annotate-wordify-buffer)
(split-string (buffer-substring-no-properties
(point-min) (point-max))))))
(defun annotate-wordify-buffer ()
"Remove punctuation and extra whitespace, leaving just one word per line."
(goto-char (point-min))
(while (re-search-forward "\\b[^[:alpha:]]+\\b" nil t)
(replace-match "\n" t t))
(flush-lines "[[:digit:]]" (point-min) (point-max))
(downcase-region (point-min) (point-max)))
(defun annotate-word-table (table file selection &rest override-tables)
"Update TABLE with word frequences from anki FILE given sql SELECTION.
Update OVERRIDE-TABLES if the word exists in one of them.
Return the total number of words processed."
(with-temp-buffer
(when (not
(zerop
(call-process "sqlite3" nil (current-buffer) nil
(expand-file-name file)
(concat "select question from cards where " selection ";"))))
(error "%s" (buffer-substring (point-min) (point-max))))
(goto-char (point-min))
;; remove <span> markup
(while (re-search-forward "<.*?>" nil t) (replace-match " "))
;; build a hash table of word frequencies
(annotate-convert-table
table
(apply 'annotate-word-table-from-buffer
table
(current-buffer) (point-min) (point-max)
override-tables))))
(defun annotate-word-table-from-buffer (table buffer start end &rest override-tables)
"Update hash TABLE of words and number of occurrances
from BUFFER between START and END.
Return total number of words in buffer."
(with-temp-buffer
(insert-buffer-substring-no-properties buffer start end)
;; one word per line, and no punctuation
(annotate-wordify-buffer)
(goto-char (point-min))
(clrhash table)
(let ((word-count 0))
(while (forward-word)
(incf word-count)
(let ((word (word-at-point))
exists)
;; update in overriding tables, if it exists there
(dolist (otab override-tables)
(when (gethash word otab)
(annotate-add-word word otab)
(setq exists t)))
;; otherwise add to new table
(unless exists
(annotate-add-word word table))))
word-count)))
(defun annotate-dump-table (table)
(with-current-buffer (get-buffer-create "*dump*")
(erase-buffer)
(maphash (lambda (w f) (insert (format "%05d: %s\n" f w)))
table)
(pop-to-buffer (current-buffer))
(sort-regexp-fields t "^\\(.*\\):.*" "\\1" (point-min) (point-max))
(annotate-buffer)
(goto-char (point-min))))
(defun annotate-convert-table (tab words-total)
"Convert a table with integer values into ratio values."
(let ((total (float words-total)))
(maphash (lambda (w c)
(puthash w (/ c total) tab))
tab)))
(defun annotate-word-frequency (&optional table-filter list-filter)
"Return an table of word/frequency pairs.
Ignore words in TABLE-FILTER and LIST-FILTER."
(save-excursion
(goto-char (point-min))
(let ((tab (annotate-make-table))
(words-total 0))
(while (annotate-search-forward-word (point-max))
(let ((word (downcase (match-string 0))))
(unless (or (and table-filter (gethash word table-filter))
(member word list-filter))
(annotate-add-word word tab))))
tab)))
(defun annotate-hash-table-to-alist (table)
(let (alist)
(maphash (lambda (word freq)
(setq alist (cons (cons word freq) alist)))
table)
alist))
;;; flashcard generation
(defvar annotate-flashcard-mode-map
(let ((map (make-sparse-keymap)))
(define-key map (kbd "C-c C-c") 'annotate-flashcard-submit)
(define-key map (kbd "C-c C-i") 'annotate-flashcard-insert-unknown-words)
map)
"Keymap for `annotate-flashcard-mode'.")
(define-derived-mode annotate-flashcard-mode text-mode "Flashcard"
"Major mode for creating flashcards.
\\{annotate-flashcard-mode-map}"
(setq fill-paragraph-function 'annotate-flashcard-fill-paragraph)
(setq fill-column 1000))
(defun annotate-flashcard-fill-paragraph (&optional justify region)
(save-restriction
(narrow-to-region (field-beginning) (field-end))
(let (fill-paragraph-function)
(fill-paragraph justify region))))
(defvar annotate-card-divider
"\n \n")
(defvar annotate-window-configuration nil)
(defun annotate-make-flashcard (&optional front)
(let ((sentence front))
(setq annotate-window-configuration
(current-window-configuration))
(let ((buf (generate-new-buffer-name "*card*")))
(pop-to-buffer (get-buffer-create buf)))
(let ((inhibit-read-only t)) (erase-buffer))
(annotate-flashcard-mode)
(if sentence
(insert sentence)
(newline))
(while (re-search-forward "^[^ :]+: \\([ \t]+\\)" nil t)
(delete-region (match-beginning 1) (match-end 1)))
;; remove whitespace we dont need
(whitespace-cleanup-region (point-min) (point-max))
;; leading space isnt handled by whitespace-cleanup
(goto-char (point-min))
(save-excursion
(while (re-search-forward "^[[:space:]]+" nil t)
(delete-region (match-beginning 0) (match-end 0))))
(goto-char (point-max))
(insert (propertize annotate-card-divider
'face 'font-lock-preprocessor-face
'field t
'rear-nonsticky t))
(unless sentence
(goto-char (point-min)))
(set-input-method 'spanish-postfix)))
(defun annotate-flashcard-submit ()
"Add card to `annotate-import-file'."
;; replace divider with tab character, and newlines with <BR>
(interactive)
(let ((inhibit-read-only t))
(goto-char (point-min))
(re-search-forward annotate-card-divider)
(replace-match "\t")
;; remove final newline, if present.
(goto-char (point-max))
(when (looking-at "^$")
(delete-backward-char 1))
;; convert remaining newlines
(goto-char (point-min))
(while (re-search-forward "\n" nil t)
(replace-match "<BR>"))
(goto-char (point-max))
(newline)
;; add to import file
(append-to-file (point-min) (point-max) annotate-import-file)
(kill-buffer)
(set-window-configuration annotate-window-configuration)))
(defun annotate-flashcard-insert-unknown-words ()
(interactive)
(let (words)
(save-excursion
(goto-char (point-min))
(re-search-forward annotate-card-divider)
(goto-char (match-beginning 0))
(setq words (annotate-find-unknown-words
(buffer-substring (point-min) (point))
annotate-known-table)))
(save-excursion
(dolist (word (reverse words))
(insert (concat word ": "))
(newline 2)))
(end-of-line)))
(defun annotate-add-sentence (sentence)
(when sentence
(save-excursion
(with-current-buffer (get-buffer-create "*sentence pile*")
(goto-char (point-max))
(insert (format "%s\n\n" sentence))))))
(defun annotate-collect-sentences (&optional buffer)
(with-current-buffer (or buffer (current-buffer))
(setq annotate-word-sentence-table (annotate-make-table))
(goto-char (point-min))
(while (not (>= (point) (point-max)))
(forward-sentence)
(sit-for 0)
(let* ((sentence (annotate-cleanup-sentence (sentence-at-point))))
(annotate-add-sentence sentence)))))
(defun annotate-dump-words ()
(interactive)
(annotate-update-tables)
(with-temp-buffer
(maphash (lambda (word freq)
(insert (format "%s\n" word)))
annotate-mature-table)
(sort-lines nil (point-min) (point-max))
(write-region (point-min) (point-max)
(concat "/home/rcy/spanish/words/"
(format-time-string "%Y%m%d")
".txt"))))
(provide 'annotate)
;;; annotate.el ends here
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment