Skip to content

Instantly share code, notes, and snippets.

@deobald
Last active April 15, 2021 21:44
Show Gist options
  • Save deobald/20e2739553455dd85f19204149845f5f to your computer and use it in GitHub Desktop.
Save deobald/20e2739553455dd85f19204149845f5f to your computer and use it in GitHub Desktop.
Clean up Pariyatti Daily Words translations
#!/usr/bin/env bb
#_( ;; Allow this script to be executed directly
"exec" "bb" -o "--classpath" "." "$0" "$@"
)
;; == How To Use This File ==
;;
;; 0) Copy the English daily words file (`daily_words_one_loop.txt`) and another
;; language (ex. `daily_words_one_loop_espanol.txt`) into the directory where
;; you have downloaded this script.
;; 1) Install Babashka: https://github.com/babashka/babashka#installation
;; 2) Run `chmod +x reorder_txt.clj` to make this file executable.
;; 3) Run this script as follows:
;;
;; ./reorder_txt.clj --english daily_words_one_loop.txt --other daily_words_one_loop_espanol.txt --mp3-marker "Escuchar"
;;
;; 4) Use a diff tool like Meld (https://meldmerge.org) to compare the English TXT file
;; and `output.txt`.
;;
;; 5) Change the "other" language and "mp3-marker" flags as appropriate to clean up
;; TXT files which are not Spanish. This script has only been tested with English/Spanish.
(ns reorder-txt
(:require [clojure.string :as str]
[babashka.fs :as fs] ;; requires babashka 0.2.9
[clojure.tools.cli :refer [parse-opts]]
[clojure.pprint :as pprint :refer [pprint print-table]]))
(defn split-days
  "Breaks the full text of a daily-words file into a vector with one string
  per day. Days are delimited by a `~` character in the file."
  [txt]
  (let [day-separator #"~"]
    (str/split txt day-separator)))
(defn shred
  "Splits one day's `entry` at the first literal occurrence of `marker`.

  Returns a vector of at most two strings: the text before the marker and
  the text after it. An entry that does not contain the marker comes back
  as a one-element vector; a marker at the very end of the entry yields an
  empty string as the second element.

  `marker` is matched literally (it is regex-quoted), so markers containing
  regex metacharacters such as `(`, `.` or `?` behave as plain text. The
  split stops after the first marker, so any later occurrence stays inside
  the second element instead of becoming a third piece that callers reading
  only two elements would lose."
  [marker entry]
  ;; Splitting on `Listen: ` is safe for the known data:
  ;;   %% grep "~" daily_words_one_loop.txt | wc -l                    => 169
  ;;   %% grep "Listen: " daily_words_one_loop.txt | wc -l             => 170
  ;;   %% grep "Escuchar: " daily_words_one_loop_espanol.txt | wc -l   => 114
  ;;   (56 translations are missing)
  ;; The pieces are deliberately NOT trimmed here; trimming happens in kosa.
  (str/split entry
             (re-pattern (java.util.regex.Pattern/quote marker))
             2))
(defn repair
  "Puts `marker` back in front of the second element of `pair`.

  `pair` is the result of splitting an entry on `marker`, which removes the
  marker text itself. Returns a two-element vector `[before marker+after]`.
  When `pair` has no second element the result's second element is just
  `marker`."
  [marker pair]
  (let [[before after] pair]
    [before (str marker after)]))
(defn parse
  "Turns the text of a daily-words file into a lazy sequence of two-element
  vectors, one per day.

  `marker` is the bare word that introduces the audio line (for example
  \"Listen\"); the actual split string is that word followed by `\": \"`.
  Each day is split on that string and the string is then restored in front
  of the second part."
  [txt marker]
  (let [full-marker (str marker ": ")
        ;; Entries are deliberately NOT trimmed here; trimming happens in kosa.
        split-and-restore (fn [day]
                            (repair full-marker (shred full-marker day)))]
    (map split-and-restore (split-days txt))))
(defn ingest
  "Reads the file `f` in full and parses its contents with `parse`, using
  `marker` as the audio-line word. Returns whatever `parse` returns."
  [f marker]
  (-> f
      slurp
      (parse marker)))
(defn insert
  "Builds the output pair for one `entry` of the English file.

  `entry` is a `[key text]` pair and `ot` is a map from key to replacement
  text. The key is always kept; the text is taken from `ot` when the key is
  present there and from `entry` itself otherwise."
  [entry ot]
  (let [[pali fallback] entry]
    [pali (get ot pali fallback)]))
(defn zip
  "Walks the English pairs `en` in order and, for each one, substitutes the
  text from the other-language pairs `ot` that share the same first element.
  English pairs with no counterpart in `ot` are passed through unchanged.
  Returns a lazy sequence of pairs in the order of `en`."
  [en ot]
  (let [translations (into {} ot)]
    (for [entry en]
      (insert entry translations))))
(defn squash
  "Serialises `pairs` back into file text: the elements of each pair are
  concatenated with nothing in between, and the resulting entries are joined
  with `~`."
  [pairs]
  (let [entries (map #(apply str %) pairs)]
    (str/join "~" entries)))
(def cli-options
  "Option specs handed to `parse-opts`. Every option is long-form only (the
  short-option slot is nil), takes one argument, and has surrounding
  whitespace trimmed from its value."
  [[nil "--english ENGLISH_FILE" "English TXT file"
    :parse-fn str/trim]
   [nil "--other OTHER_FILE" "Other language TXT file"
    :parse-fn str/trim]
   [nil "--mp3-marker MARKER" "Other language's word for 'Listen'"
    :parse-fn str/trim]])
;; Script entry point.
;;
;; Reads the `--english`, `--other` and `--mp3-marker` flags, parses both
;; files, replaces each English entry's text with the other language's text
;; where the first part of the entry matches, and writes the merged result to
;; `output.txt` in the current directory.
(let [{:keys [options errors]} (parse-opts *command-line-args* cli-options)
      english (:english options)
      other   (:other options)
      marker  (:mp3-marker options)]
  ;; parse-opts reports unknown flags and flags that are missing their
  ;; argument under :errors rather than throwing; surface them instead of
  ;; silently ignoring them.
  (assert (empty? errors)
          (str "Could not parse the command line:\n" (str/join "\n" errors)))
  (assert english "`--english` must be specified")
  (assert other "`--other` must be specified")
  (assert marker "`--mp3-marker` must be specified")
  ;; Read the other-language file once and reuse the text for both the
  ;; sanity check and the parse.
  (let [other-txt (slurp other)]
    (assert (str/includes? other-txt marker)
            (format "File '%s' did not contain the string '%s'. Are you sure you spelled it correctly?"
                    other marker))
    (println (format "Ingesting English .txt file: '%s'" english))
    ;; The English file's audio-line word is always "Listen".
    (let [en (ingest english "Listen")
          _  (println (format "Ingesting Other .txt file: '%s'" other))
          ot (parse other-txt marker)]
      (println "Zipping English file and translation file together.")
      (let [squashed (squash (zip en ot))]
        (println "Printing results to file...")
        (spit "output.txt" squashed))
      ;; Un-discard these forms to dump the parsed pairs when debugging.
      #_(spit "en-debug.txt" (with-out-str (pprint (doall en))))
      #_(spit "ot-debug.txt" (with-out-str (pprint (doall ot))))
      (println "...done. Results written to 'output.txt'."))))
;; Local Variables:
;; mode: clojure
;; End:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment