Last active
May 24, 2017 11:14
-
-
Save jackrusher/8417b6c4c56927f5bb20 to your computer and use it in GitHub Desktop.
A bit of poor man's stylometry using Etymological Wordnet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Some fun with the etymological wordnet database | |
;; <http://www1.icsi.berkeley.edu/~demelo/etymwn/> | |
;; you'll need the database from that link and this table | |
;; <https://gist.github.com/jackrusher/b42152c40cb56b466085> | |
;; to eval the code in this gist within your own environment. | |
(defn fixup-name
  "Turns a raw node label into a keyword: the label is lower-cased, each
  \": \" is replaced by \"-\", then each remaining space is replaced by \"-\".
  For example \"eng: thou\" becomes :eng-thou."
  [name]
  (let [lowered   (clojure.string/lower-case name)
        no-colons (clojure.string/replace lowered #": " "-")
        no-spaces (clojure.string/replace no-colons #" " "-")]
    (keyword no-spaces)))
;; Map from a word keyword to the seq of keywords it is recorded as coming
;; from, built in one eager pass over the tab-separated triples in
;; data/etymwn.tsv (subject, predicate, object per line).
(def etymology
  (let [;; Record `origin-key` under `word-key`, keeping each entry distinct.
        link   (fn [index word-key origin-key]
                 (assoc index word-key
                        (distinct (conj (get index word-key []) origin-key))))
        ;; Fold a single line of the file into the index.
        absorb (fn [index line]
                 (let [[subject predicate object] (clojure.string/split line #"\t")]
                   (case predicate
                     ;; subject is derived from object
                     ("rel:is_derived_from" "rel:etymology")
                     (link index (fixup-name subject) (fixup-name object))

                     ;; other direction: subject is the origin of object
                     "rel:etymological_origin_of"
                     (link index (fixup-name object) (fixup-name subject))

                     ;; every other relation is ignored
                     index)))]
    ;; reduce is eager, so the reader is fully consumed before with-open
    ;; closes it.
    (with-open [rdr (clojure.java.io/reader "data/etymwn.tsv")]
      (reduce absorb {} (line-seq rdr)))))
(defn trace-etymology
  "Returns `etym` followed by everything reachable from it through the
  `etymology` map, depth first, with duplicates removed.

  The two-argument arity is the recursive worker: `seen` holds the keywords
  already entered on the current path, which are not followed again. Its
  result may contain repeats; the one-argument arity removes them."
  ([etym]
   (distinct (trace-etymology etym #{})))
  ([etym seen]
   (let [unvisited (remove seen (etymology etym))
         descend   (fn [parent]
                     (trace-etymology parent (conj seen parent)))]
     ;; conj on a seq prepends, so `etym` heads its own subtree
     (conj (mapcat descend unvisited) etym))))
;; We can thus trace the etymology of a word. The one-argument arity of
;; trace-etymology applies `distinct`, so each ancestor appears only once
;; in the results shown below.

(trace-etymology :eng-thou)
;; => (:eng-thou :enm-thou :ang-þu :p_gem-thu :p_ine-tu)

(trace-etymology :eng-you)
;; => (:eng-you :enm-you :ang-eow :p_gmw-iuwiz :p_ine-ju)

(trace-etymology :deu-mandel)
;; => (:deu-mandel :goh-mandala :lat-amygdala :lat-amygdalum :grc-ἀμύγδαλον :grc-ἀμυγδάλη)
;; how about some style analysis? | |
(defn tokenize
  "Splits `phrase` into lower-cased words and returns a vector of keywords
  of the form :<lang>-<word>, e.g. (tokenize \"eng\" \"We shall\") gives
  [:eng-we :eng-shall].

  Words are separated by runs of non-word characters, so an apostrophe
  splits a word (\"God's\" yields \"god\" and \"s\"). The pattern carries the
  (?U) flag so that \\w covers Unicode letters; without it Java treats \\w as
  ASCII-only and words such as \"über\" or \"þu\" would be cut apart. Empty
  tokens (from a leading delimiter or an empty phrase) are dropped rather
  than turned into a bare :<lang>- keyword."
  [lang phrase]
  (->> (clojure.string/split (clojure.string/lower-case phrase) #"(?U)[^\w]+")
       (remove empty?)
       (mapv (fn [w] (keyword (str lang "-" w))))))
(defn word-origin-frequencies
  "Counts, per language code, the ancestors of every word in `text`.

  Each token produced by (tokenize lang text) is expanded with
  trace-etymology. Entries in the text's own language (prefix :<lang>-) and
  entries whose code starts with \"p_\" (e.g. :p_gem-thu) are discarded. The
  remaining keywords are reduced to their language code (the part between
  the leading colon and the first \"-\") and tallied.

  Returns a map of language-code string to occurrence count.

  The own-language filter is derived from `lang`; it was previously fixed
  to \"eng\", which gave wrong tallies for any other language."
  [lang text]
  (let [^String own-prefix (str ":" lang "-")
        uninformative?     (fn [word-key]
                             (let [^String s (str word-key)]
                               (or (.startsWith s own-prefix)
                                   (.startsWith s ":p_"))))
        language-code      (fn [word-key]
                             (let [^String s (str word-key)]
                               ;; skip the leading ":" and stop at the first "-"
                               (subs s 1 (.indexOf s "-"))))]
    (frequencies
     (mapcat (fn [token]
               (->> (trace-etymology token)
                    (remove uninformative?)
                    (map language-code)))
             (tokenize lang text)))))
(defn report-percentages
  "Converts a map of language code to count (`rs`) into a seq of
  [label percentage] pairs, sorted by percentage, largest first. The label
  is whatever `iso-639-b` returns for the code; the percentage is a float
  share of the total count."
  [rs]
  (let [tot (reduce + (vals rs))
        row (fn [[code n]]
              [(iso-639-b code) (float (* 100 (/ n tot)))])]
    (sort-by second > (map row rs))))
;; Churchill does blood and guts for the common man.
;; The result is recorded below as comments: left as a live form it would be
;; evaluated as a call on its first vector and throw when the file is loaded.
(report-percentages
 (word-origin-frequencies "eng" "We shall go on to the end, we shall
fight in France, we shall fight on the seas and oceans, we shall
fight with growing confidence and growing strength in the air, we
shall defend our Island, whatever the cost may be, we shall fight on
the beaches, we shall fight on the landing grounds, we shall fight in
the fields and in the streets, we shall fight in the hills; we shall
never surrender, and even if, which I do not for a moment believe,
this Island or a large part of it were subjugated and starving, then
our Empire beyond the seas, armed and guarded by the British Fleet,
would carry on the struggle, until, in God's good time, the New
World, with all its power and might, steps forth to the rescue and
the liberation of the old."))
;; =>
;; (["English, Old (ca.450-1100)" 42.92453]
;;  ["English, Middle (1100-1500)" 35.377357]
;;  ["Latin" 9.905661]
;;  ["French, Old (842-ca.1400)" 4.245283]
;;  ["Norse, Old" 2.8301888]
;;  ["French" 1.4150944]
;;  ["Anglo-Norman" 1.4150944]
;;  ["Italian" 0.9433962]
;;  ["Middle Low German" 0.4716981]
;;  ["Greek, Ancient (to 1453)" 0.4716981])
;; Churchill talking to his peers.
;; The result is recorded below as comments: left as a live form it would be
;; evaluated as a call on its first vector and throw when the file is loaded.
(report-percentages
 (word-origin-frequencies "eng" "A new Government has come into being
under a Prime Minster who, like his predecessor whose loss we all
profoundly deplore, and whose many virtues all parties have joined to
celebrate - a new Prime Minister has come into power, tied to
Scotland by strong and intimate bonds. Give him a fair chance. [Hear,
hear.] Give the Government which he has brought into being the
opportunity of handling the great machinery of State. Be assured
that, if you do, they will employ it for the greatest good of the
greatest number. I am well satisfied at what has taken place in the
last four or five days since I have been in Dundee. I see a great
concentration of forces throughout the constituency. I see the
opportunity of retrieving, and more than retrieving, the injury which
has been done to the cause of progress and reform by elections in
other parts of our land."))
;; =>
;; (["English, Middle (1100-1500)" 36.39576]
;;  ["English, Old (ca.450-1100)" 29.681978]
;;  ["Latin" 20.848057]
;;  ["French, Old (842-ca.1400)" 5.3003535]
;;  ["French" 1.7667844]
;;  ["Anglo-Norman" 1.7667844]
;;  ["Italian" 1.4134276]
;;  ["Norse, Old" 1.0600706]
;;  ["French, Middle (ca.1400-1600)" 0.7067138]
;;  ["Greek, Ancient (to 1453)" 0.7067138]
;;  ["Gaelic; Scottish Gaelic" 0.3533569])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment