Created
January 28, 2021 03:23
-
-
Save xorgy/2a0aaaf4fd732be51543f99788144b97 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns je.suis.un-petit-index | |
(:require [clojure.set :refer [intersection]])) | |
(defn- normalize-nfd
  "Returns `s` converted to Unicode Normalization Form D (canonical
  decomposition) via java.text.Normalizer.

  The return value is hinted as String so that callers can invoke String
  methods on the result without a reflective call."
  ^String [^String s]
  (java.text.Normalizer/normalize s java.text.Normalizer$Form/NFD))
(defn- two-gram
  "Pairs every element of `cs` with its successor, in order, and returns
  the pairs as a vector of two-element vectors. An input with fewer than
  two elements yields an empty vector."
  [cs]
  ;; Walking `cs` in lockstep with its own tail stops at the shorter of
  ;; the two, so the last element never starts a pair.
  (mapv vector cs (rest cs)))
(defn- mkgrams
  "Builds the set of grams used to index or query `s`.

  `s` is coerced with `str`, NFD-normalized and lower-cased, then split
  into a vector of characters. Level 0 is the characters themselves; each
  further level is the adjacent pairs (see `two-gram`) of the level before
  it. The result is the union of levels 0 through `n` (default 6) as a
  single set.

  Lower-casing uses Locale/ROOT so that the grams produced for a given
  string do not depend on the JVM's default locale."
  ([s] (mkgrams s 6))
  ([s n]
   (let [^String normalized (normalize-nfd (str s))
         cs (vec (.toLowerCase normalized java.util.Locale/ROOT))
         ;; iterate yields cs, (two-gram cs), (two-gram (two-gram cs)), ...
         ;; (max 0 n) keeps level 0 in the result even for a non-positive n.
         levels (take (inc (max 0 n)) (iterate two-gram cs))]
     (reduce into #{} levels))))
(defn normalized-compare
  "Scores how much of `a` is found in `b`: the number of grams of `a`
  (per `mkgrams` at depth `n`, default 6) that also occur among the grams
  of `b`, divided by the total number of grams of `a`. The score is
  asymmetric in its arguments and is an exact integer or ratio between
  0 and 1.

  Returns 0 when `a` produces no grams (for example an empty string)
  rather than throwing on a division by zero."
  ([^String a ^String b] (normalized-compare a b 6))
  ([^String a ^String b n]
   (let [grams-a (mkgrams a n)
         total (count grams-a)]
     (if (zero? total)
       0
       (/ (count (intersection grams-a (mkgrams b n))) total)))))
(defn gramdb-to-index
  "Transposes a map of key -> collection of grams into a map of
  gram -> set of the keys whose collection contains that gram."
  [db]
  (reduce
   (fn [index [k gram]]
     (update index gram (fnil conj #{}) k))
   {}
   ;; One [key gram] pair per posting, visited in the same order a nested
   ;; walk over entries and then over their grams would use.
   (for [[k grams] db
         gram grams]
     [k gram])))
(defn map-to-gramdb
  "Returns a map with the keys of `m`, each associated with the gram set
  that `mkgrams` produces for its value at depth `n` (default 6)."
  ([m] (map-to-gramdb m 6))
  ([m n]
   (into {}
         (for [[k v] m]
           [k (mkgrams v n)]))))
(defn normalized-gramdb-query
  "Queries a gram database `g` (a map of key -> gram set, as built by
  `map-to-gramdb`) with `q`.

  `q` is turned into grams with `mkgrams` at depth `n` (default 6). The
  result maps every key that shares at least one gram with the query to
  the fraction of the query's grams found under that key (an exact
  integer or ratio). Keys with no gram in common are omitted, and a query
  that produces no grams yields an empty map."
  ([g q] (normalized-gramdb-query g q 6))
  ([g q n]
   (let [query-grams (mkgrams q n)
         total (count query-grams)]
     (if (zero? total)
       ;; Nothing can match an empty query; skip scanning the database.
       {}
       (->> g
            ;; The set intersection is the only non-trivial per-entry
            ;; work, so it is the only step left on pmap.
            (pmap (fn [[k grams]]
                    [k (count (intersection query-grams grams))]))
            (filter (fn [[_ hits]] (pos? hits)))
            ;; A single division per entry is far cheaper than pmap's
            ;; coordination overhead, so a plain map is used here.
            (map (fn [[k hits]] [k (/ hits total)]))
            (into {}))))))
(defn normalized-index-query
  "Queries an inverted index `i` (a map of gram -> set of keys, as built
  by `gramdb-to-index`) with `q`.

  `q` is turned into grams with `mkgrams` at depth `n` (default 6). The
  result maps every key listed under at least one query gram to the
  fraction of the query's grams that list it (an exact integer or
  ratio). A query that produces no grams yields an empty map."
  ([i q] (normalized-index-query i q 6))
  ([i q n]
   (let [query-grams (mkgrams q n)
         total (count query-grams)
         ;; Each key appears once per query gram that lists it, so its
         ;; frequency is the number of query grams it matched.
         hits-by-key (->> (select-keys i query-grams)
                          vals
                          (apply concat)
                          frequencies)]
     ;; A single division per key is far cheaper than pmap's coordination
     ;; overhead, so a plain map is used instead.
     (into {}
           (map (fn [[k hits]] [k (/ hits total)]))
           hits-by-key))))
;; Load-time self-check: builds a gram database and its inverted index from
;; a small mixed corpus (Korean and English text, numbers, keywords and a
;; set) and asserts that both query paths agree with each other.
(let [database
      {:제26조1 "모든 사람은 교육을 받을 권리를 가진다 . 교육은 최소한 초등 및 기초단계에서는 무상이어야 한다. 초등교육은 의무적이어야 한다. 기술 및 직업교육은 일반적으로 접근이 가능하여야 하며, 고등교육은 모든 사람에게 실력에 근거하여 동등하게 접근 가능하여야 한다."
       :제26조2 "교육은 인격의 완전한 발전과 인권과 기본적 자유에 대한 존중의 강화를 목표로 한다. 교육은 모든 국가 , 인종 또는 종교 집단간에 이해, 관용 및 우의를 증진하며 , 평화의 유지를 위한 국제연합의 활동을 촉진하여야 한다."
       :제26조3 "부모는 자녀에게 제공되는 교육의 종류를 선택할 우선권을 가진다 ."
       :제27조1 "모든 사람은 공동체의 문화생활에 자유롭게 참여하며 예술을 향유하고 과학의 발전과 그 혜택을 공유할 권리를 가진다 ."
       :제27조2 "모든 사람은 자신이 창작한 과학적 , 문학적 또는 예술적 산물로부터 발생하는 정신적, 물질적 이익을 보호받을 권리를 가진다 ."
       :foo "this, that, and the other thing thing. A rose by any other name is as sweet."
       :bar "Another one bites the dust. To be, or not to be, that is the question; whether 'tis nobler in the mind to suffer the slings and arrows of outrageous fortune; or to take arms against a sea of troubles and by opposing, end them."
       :baz "To die, to sleep. No more; and by a sleep to say we end the heartache and the thousand natural shocks that flesh is heir to. 'Tis a consummation devoutly to be wished. To die. To sleep."
       :qux "To sleep. Perchance to dream; ay, there's the rub: For in that sleep of death, what dreams may come when we have shuffled off this mortal coil, must give us pause."
       :quux 69420
       :quuux 42069
       :fungus :balthazar
       :virus :bazaar
       :helicopter #{:fungus "trilogy" :balance}
       :wiki-a "design kernels that allow machine learning algorithms such as support vector machines to learn from string data"
       :wiki-b "find likely candidates for the correct spelling of a misspelled word"
       :wiki-c "improve compression in compression algorithms where a small area of data requires n-grams of greater length"
       :wiki-d "assess the probability of a given word sequence appearing in text of a language of interest in pattern recognition systems, speech recognition, OCR (optical character recognition), Intelligent Character Recognition (ICR), machine translation and similar applications"
       :wiki-e "improve retrieval in information retrieval systems when it is hoped to find similar \"documents\" (a term for which the conventional meaning is sometimes stretched, depending on the data set) given a single query document and a database of reference documents"
       :wiki-f "improve retrieval performance in genetic sequence analysis as in the BLAST family of programs"
       :wiki-g "identify the language a text is in or the species a small sequence of DNA was taken from"
       :wiki-h "predict letters or words at random in order to create text, as in the dissociated press algorithm."
       :wiki-i "cryptanalysis"}
      gramdb (map-to-gramdb database)
      index (gramdb-to-index gramdb)]
  ;; the transpose between gramdb and index is reversible
  (assert (= gramdb (gramdb-to-index index))
          "transposing the index must give back the gramdb")
  ;; the result of a query on the gramdb or on the index is identical for
  ;; the same query; the empty string covers the no-grams edge case
  (doseq [q ["#{:balance :fungus}" 420 "한다." ""]]
    (assert (= (normalized-index-query index q)
               (normalized-gramdb-query gramdb q))
            (str "index and gramdb results differ for query " (pr-str q)))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment