Created
July 27, 2011 14:48
-
-
Save antoniogarrote/1109507 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Shared LevenshteinDistanceMetric instance, created once when this file is
;; loaded and used by `levenshtein-metric` through its `.distance` method.
;; NOTE(review): the name uses earmuffs, but no ^:dynamic metadata is present
;; on this def, so it is an ordinary var rather than a rebindable one.
(def *levenshtein*
  (new org.tiling.misspelling.metric.LevenshteinDistanceMetric))
(defn levenshtein-metric
  "Similarity score for `a` and `b` derived from their edit distance.

  The distance reported by `*levenshtein*` is divided by the combined length
  (count a) + (count b), and that ratio is subtracted from 1, so a distance
  of zero yields 1.0.

  NOTE(review): when both arguments are empty the combined length is zero and
  the division is 0/0 in floating point, so the result is not a usable score.
  Callers should avoid passing two empty strings."
  [a b]
  (let [combined-length (+ (count a) (count b))
        edits           (.distance *levenshtein* a b)
        ratio           (/ (double edits) (double combined-length))]
    (- 1 ratio)))
(defn introduce-blanks
  "Returns `w` with every literal occurrence of `b` replaced by one space.

  Delegates to the `replace` method of `w`, so `b` is matched as plain text,
  not as a regular expression."
  [b w]
  (. w replace b " "))
(defn process-url
  "Breaks `url` into a lazy sequence of candidate search-term strings.

  Steps, in order:
    1. split the URL at `?` into a path and an optional query string;
    2. split the path on `/` and the query string on `&`;
    3. URL-decode every piece with `url-decode`;
    4. drop empty pieces, the scheme tokens http: and https:, and any piece
       that starts with www;
    5. for pieces of the form key=value keep only what follows the last `=`;
    6. replace `-` and `+` with spaces via `introduce-blanks`.

  `str-split` and `url-decode` are helpers defined outside this block; their
  exact behaviour is assumed from how they are called here."
  ([url]
     (let [[path query] (str-split url "\\?")
           path-parts   (str-split path "/")
           ;; A URL without `?` destructures `query` to nil. Do not hand nil
           ;; to str-split; treat it as having no query parameters.
           query-parts  (if (empty? query)
                          []
                          (str-split query "&"))]
       (->> (concat path-parts query-parts)
            (map #(url-decode %))
            (remove (fn [^String part]
                      (or (empty? part)
                          (= "http:" part)
                          ;; the https scheme token is as uninformative as http
                          (= "https:" part)
                          (.startsWith part "www"))))
            (map #(last (str-split % "=")))
            (map #(introduce-blanks "-" %))
            (map #(introduce-blanks "+" %))))))
(defn detect-fuzzy-matches
  "Returns the candidate in `ws` that best matches keyword `kw`, or nil.

  `kw` and every candidate are split into words on single spaces. A candidate
  scores one point for each of its words whose similarity to at least one
  keyword word, as computed by (m keyword-word candidate-word), is strictly
  greater than `threshold`. The candidate with the highest score is returned;
  on a tie the one that appears first in `ws` wins. Returns nil when `ws` is
  empty or when no candidate scores above zero.

  Arguments:
    kw        - keyword string
    ws        - sequence of candidate strings
    m         - two-argument similarity function
    threshold - optional minimum similarity (exclusive); defaults to 0.95"
  ([kw ws m]
     (detect-fuzzy-matches kw ws m 0.95))
  ([kw ws m threshold]
     (letfn [(tokens [^String s]
               ;; Consecutive spaces make .split emit empty strings. They
               ;; carry no information and must never count as a match.
               (remove empty? (.split s " ")))
             (matches? [parts-a part-b]
               ;; Equivalent to (> (apply max ...) threshold), but also safe
               ;; when parts-a is empty.
               (some (fn [part-a] (> (m part-a part-b) threshold)) parts-a))
             (fuzzy-score [parts-a parts-b]
               (count (filter (fn [part-b] (matches? parts-a part-b)) parts-b)))]
       (let [parts-kw (tokens kw)
             scored   (map (fn [w] [w (fuzzy-score parts-kw (tokens w))]) ws)
             ;; The sort is stable, so the first candidate wins among equal
             ;; scores.
             [best best-score] (first (sort-by second > scored))]
         ;; best-score is nil when ws is empty; guard before comparing.
         (when (and best-score (pos? best-score))
           best)))))
(defn choose-with-metric
  "Pairs every element of `ws` with its score (m element) and returns the
  pair with the greatest score as a two-element list (element score).

  Scores are ordered with `compare`. When several elements share the top
  score, the one that comes first in `ws` is returned. Returns nil when `ws`
  is empty."
  [m ws]
  (let [scored     (map (fn [w] (list w (m w))) ws)
        descending (fn [score-a score-b] (compare score-b score-a))]
    (first (sort-by second descending scored))))
(defn find-search-term
  "Finds the part of `referrer` (a URL string) that best matches `keyword`.

  Always returns a two-element vector [term score]:
    - [nil 0.0] when `referrer` or `keyword` is empty, or when the URL yields
      no candidate parts after `process-url`;
    - [match 1.0] when `detect-fuzzy-matches` finds a word-level match;
    - otherwise the candidate with the highest (metric keyword candidate)
      together with that score, as chosen by `choose-with-metric`.

  `metric` is a two-argument similarity function and defaults to
  `levenshtein-metric`."
  ([referrer keyword]
     (find-search-term referrer keyword levenshtein-metric))
  ([referrer keyword metric]
     (if (or (empty? referrer) (empty? keyword))
       [nil 0.0]
       (let [url-parts (process-url referrer)]
         (if (empty? url-parts)
           ;; Nothing survived URL processing, so there is nothing to score.
           ;; Return the same no-result value as for empty input.
           [nil 0.0]
           (if-let [fuzzy-match (detect-fuzzy-matches keyword url-parts metric)]
             [fuzzy-match 1.0]
             ;; choose-with-metric yields a list; convert it so every branch
             ;; returns a vector.
             (vec (choose-with-metric (partial metric keyword) url-parts))))))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment