Skip to content

Instantly share code, notes, and snippets.

@antoniogarrote
Created July 27, 2011 14:48
Show Gist options
  • Save antoniogarrote/1109507 to your computer and use it in GitHub Desktop.
Save antoniogarrote/1109507 to your computer and use it in GitHub Desktop.
;; Single LevenshteinDistanceMetric instance; levenshtein-metric calls its
;; .distance method to obtain the raw edit distance between two strings.
(def *levenshtein*
  (new org.tiling.misspelling.metric.LevenshteinDistanceMetric))
(defn levenshtein-metric
  "Similarity score for the strings a and b derived from their edit distance.

  The raw distance reported by *levenshtein* is divided by the combined
  length of both inputs and the quotient is subtracted from 1, so identical
  strings score 1.0 and the score drops as the distance grows.

  NOTE(review): when both inputs are empty the combined length is 0 and the
  double division yields NaN rather than a number (assuming the underlying
  metric reports distance 0 for that pair)."
  [a b]
  (let [combined-length (+ (count a) (count b))
        distance (.distance *levenshtein* a b)
        penalty (/ (double distance) (double combined-length))]
    (- 1 penalty)))
(defn introduce-blanks
  "Returns w with every occurrence of the literal text b replaced by a
  single space."
  [b w]
  (. w replace b " "))
(defn process-url
  "Breaks url into candidate text fragments.

  The URL is split at the question mark into a path and a query string. The
  path is split on slashes and the query string on ampersands. Every piece
  is URL-decoded; empty pieces, the scheme pieces http: and https:, and
  pieces starting with www are dropped. For the remaining pieces only the
  text after the last equals sign is kept, and dashes and plus signs are
  turned into blanks.

  Returns a lazy sequence of strings (possibly empty)."
  ([url]
     (let [[path query] (str-split url "\\?")
           path-pieces (str-split path "/")
           ;; A URL without a question mark leaves query nil; do not hand
           ;; nil to str-split, just contribute no query pieces.
           query-pieces (when query (str-split query "&"))]
       (->> (concat path-pieces query-pieces)
            (map #(url-decode %))
            (filter #(not (or (empty? %)
                              (= "http:" %)
                              ;; https referrers carry the same kind of
                              ;; scheme noise as http ones.
                              (= "https:" %)
                              (= 0 (.indexOf % "www")))))
            (map #(last (str-split % "=")))
            ;; last yields nil when the split produced no pieces; drop those
            ;; so .replace in introduce-blanks is never called on nil.
            (remove nil?)
            (map #(introduce-blanks "-" %))
            (map #(introduce-blanks "+" %))))))
(defn detect-fuzzy-matches
  "Finds the candidate in ws that shares the most words with kw.

  kw  - keyword string; split on single blanks into words.
  ws  - sequence of candidate strings, each split the same way.
  m   - similarity function of two strings returning a number; a candidate
        word counts as matched when it scores above 0.95 against at least
        one keyword word.

  Each candidate is scored by its number of matched words. Returns the
  candidate with the highest score (the earliest one on ties), or nil when
  no candidate has a matched word, when ws is empty, or when kw contains no
  words. Empty words produced by consecutive blanks are ignored."
  ([kw ws m]
     (letfn [(words [s]
               ;; .split may return an empty array (e.g. for a string of
               ;; blanks) and empty strings for doubled blanks.
               (remove empty? (seq (.split s " "))))
             (matched? [metric parts-a part-b]
               ;; `some` is safe on an empty parts-a, unlike (apply max ...).
               (boolean (some (fn [part-a] (> (metric part-a part-b) 0.95))
                              parts-a)))
             (fuzzy-score [metric parts-a parts-b]
               (count (filter (fn [part-b] (matched? metric parts-a part-b))
                              parts-b)))]
       (let [parts-kw (words kw)
             scores (map (fn [w] [w (fuzzy-score m parts-kw (words w))]) ws)
             ;; Single pass keeping the first candidate with the strictly
             ;; highest score; starting from [nil 0] covers an empty ws and
             ;; means only positive scores can win.
             [best best-score] (reduce (fn [[_ top :as leader] [_ score :as candidate]]
                                         (if (> score top) candidate leader))
                                       [nil 0]
                                       scores)]
         (when (> best-score 0)
           best)))))
(defn choose-with-metric
  "Applies the one-argument scoring function m to every element of ws and
  returns the best pair as a list (element score), where best means the
  highest score according to compare. Among equal top scores the element
  appearing first in ws wins. Returns nil when ws is empty."
  [m ws]
  (let [scored (for [w ws] (list w (m w)))
        ;; Arguments swapped so larger scores sort to the front.
        descending (fn [x y] (compare y x))]
    (first (sort-by second descending scored))))
(defn find-search-term
  "Looks for the fragment of the referrer URL that best corresponds to
  keyword.

  referrer - referrer URL string.
  keyword  - keyword string to look for.
  metric   - optional similarity function of two strings; defaults to
             levenshtein-metric.

  Always returns a two-element vector [fragment score]:
    - [nil 0.0] when referrer or keyword is empty, or when the URL yields
      no usable fragments;
    - [fragment 1.0] when detect-fuzzy-matches finds a word-level match;
    - otherwise the fragment with the highest metric score against keyword,
      together with that score."
  ([referrer keyword]
     (find-search-term referrer keyword levenshtein-metric))
  ([referrer keyword metric]
     (if (or (empty? referrer) (empty? keyword))
       [nil 0.0]
       (let [url-parts (process-url referrer)]
         (if (empty? url-parts)
           ;; Nothing to compare against: report no match instead of
           ;; passing an empty candidate list to the matchers.
           [nil 0.0]
           (if-let [fuzzy-match (detect-fuzzy-matches keyword url-parts metric)]
             [fuzzy-match 1.0]
             ;; choose-with-metric yields a list; convert it so every branch
             ;; returns the same vector shape.
             (vec (choose-with-metric (partial metric keyword) url-parts))))))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment