Last active
          April 15, 2021 21:44 
        
      - 
      
- 
        Save deobald/20e2739553455dd85f19204149845f5f to your computer and use it in GitHub Desktop. 
    Clean up Pariyatti Daily Words translations
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | #!/usr/bin/env bb | |
| #_( ;; Allow this script to be executed directly | |
| "exec" "bb" -o "--classpath" "." "$0" "$@" | |
| ) | |
| ;; == How To Use This File == | |
| ;; | |
| ;; 0) Copy the English daily words file (`daily_words_one_loop.txt`) and another | |
| ;; language (ex. `daily_words_one_loop_espanol.txt`) into the directory where | |
| ;; you have downloaded this script. | |
| ;; 1) Install Babashka: https://github.com/babashka/babashka#installation | |
| ;; 2) Run `chmod +x reorder_txt.clj` to make this file executable. | |
| ;; 3) Run this script as follows: | |
| ;; | |
| ;; ./reorder_txt.clj --english daily_words_one_loop.txt --other daily_words_one_loop_espanol.txt --mp3-marker "Escuchar" | |
| ;; | |
| ;; 4) Use a diff tool like Meld (https://meldmerge.org) to compare the English TXT file | |
| ;; and `output.txt`. | |
| ;; | |
| ;; 5) Change the "other" language and "mp3-marker" flags as appropriate to clean up | |
| ;; TXT files which are not Spanish. This script has only been tested with English/Spanish. | |
| (ns reorder-txt | |
| (:require [clojure.string :as str] | |
| [babashka.fs :as fs] ;; requires babashka 0.2.9 | |
| [clojure.tools.cli :refer [parse-opts]] | |
| [clojure.pprint :as pprint :refer [pprint print-table]])) | |
(defn split-days
  "Cuts the full file text `txt` into one string per daily entry, using the
  `~` character as the separator between entries. The pieces are returned
  exactly as found; no whitespace is trimmed."
  [txt]
  (let [day-separator #"~"]
    (str/split txt day-separator)))
(defn shred
  "Splits one daily `entry` into the text before the first occurrence of
  `marker` and the text after it.

  `marker` is matched as literal text: it is regex-quoted before being turned
  into a pattern, so markers containing characters such as `(`, `.` or `?`
  split where the caller expects. The split is limited to two parts, so an
  entry that contains the marker more than once keeps everything after the
  first occurrence instead of having the later pieces dropped downstream.

  Returns a vector: `[before after]` when the marker occurs in `entry`,
  or `[entry]` when it does not."
  [marker entry]
  ;; Splitting on `Listen: ` is safe for the known data files:
  ;;   %% grep "~" daily_words_one_loop.txt | wc -l
  ;;   => 169
  ;;   %% grep "Listen: " daily_words_one_loop.txt | wc -l
  ;;   => 170
  ;;   %% grep "Escuchar: " daily_words_one_loop_espanol.txt | wc -l
  ;;   => 114 ;; 56 translations are missing
  ;;
  ;; Do NOT trim the parts here (kosa trims, but this script must not).
  (let [literal-marker (re-pattern (java.util.regex.Pattern/quote marker))]
    (str/split entry literal-marker 2)))
(defn repair
  "Puts back the `marker` text that splitting removed.

  `pair` holds the text that preceded the marker and, optionally, the text
  that followed it. Returns the two-element vector
  `[before (str marker after)]`; when `pair` has no second element the
  second item of the result is just `marker`."
  [marker pair]
  (let [[before after] pair]
    [before (str marker after)]))
(defn parse
  "Turns the raw file text `txt` into a lazy sequence of two-element vectors,
  one per daily entry.

  `marker` is the bare word (for example \"Listen\"); `\": \"` is appended to
  it before it is used. Each entry produced by `split-days` is passed through
  `shred` and then `repair` with that full marker."
  [txt marker]
  (let [full-marker (str marker ": ")
        entry->pair (fn [entry]
                      (repair full-marker (shred full-marker entry)))]
    ;; Entries are intentionally NOT trimmed here (kosa trims, this script
    ;; must not).
    (map entry->pair (split-days txt))))
(defn ingest
  "Reads the entire file `f` with `slurp` and returns the result of running
  `parse` on its contents with the given `marker`."
  [f marker]
  (let [contents (slurp f)]
    (parse contents marker)))
(defn insert
  "Builds the output pair for one `entry`.

  The first element of `entry` is used as the lookup key into the map `ot`.
  Returns `[key value]`, where `value` is what `ot` holds for that key, or
  the entry's own second element when `ot` has no such key."
  [entry ot]
  (let [[pali fallback] entry
        chosen          (get ot pali fallback)]
    [pali chosen]))
(defn zip
  "Merges the other-language pairs `ot` into the English pairs `en`.

  `ot` is first poured into a map (first element of each pair -> second
  element). The result is a lazy sequence with one pair per element of `en`,
  in the same order, each produced by `insert` against that map."
  [en ot]
  (let [lookup (into {} ot)]
    (for [entry en]
      (insert entry lookup))))
(defn squash
  "Serialises `pairs` back into a single string: the elements of each pair
  are concatenated, and the resulting entries are joined with `~`."
  [pairs]
  (let [entries (map #(apply str %) pairs)]
    (str/join "~" entries)))
;; Option specs handed to `parse-opts`. Every option is long-form only (no
;; short flag), takes one argument, and has its value whitespace-trimmed.
(def cli-options
  (mapv (fn [[flag description]]
          [nil flag description :parse-fn str/trim])
        [["--english ENGLISH_FILE" "English TXT file"]
         ["--other OTHER_FILE"     "Other language TXT file"]
         ["--mp3-marker MARKER"    "Other language's word for 'Listen'"]]))
;; Script body: read the command-line flags, validate them, merge the
;; translation file into the English file and write the result to
;; `output.txt` in the current directory.
(let [{:keys [english other] marker :mp3-marker}
      (:options (parse-opts *command-line-args* cli-options))]
  ;; All three flags are mandatory.
  (assert english "`--english` must be specified")
  (assert other "`--other` must be specified")
  (assert marker "`--mp3-marker` must be specified")
  ;; Catch a misspelled marker before doing any real work.
  (assert (str/includes? (slurp other) marker)
          (format "File '%s' did not contain the string '%s'. Are you sure you spelled it correctly?"
                  other marker))
  (println (format "Ingesting English .txt file: '%s'" english))
  (let [en (ingest english "Listen")]
    (println (format "Ingesting Other .txt file: '%s'" other))
    (let [ot (ingest other marker)]
      (println "Zipping English file and translation file together.")
      ;; `squash` realises the merged sequence, so the full output string
      ;; exists before the next progress message is printed.
      (let [squashed (-> (zip en ot) squash)]
        (println "Printing results to file...")
        (spit "output.txt" squashed))
      ;; Un-discard these two forms to dump the parsed inputs for debugging.
      #_(spit "en-debug.txt" (with-out-str (pprint (doall en))))
      #_(spit "ot-debug.txt" (with-out-str (pprint (doall ot))))
      (println "...done. Results written to 'output.txt'."))))
| ;; Local Variables: | |
| ;; mode: clojure | |
| ;; End: | 
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment