Last active
September 1, 2020 12:15
-
-
Save mourjo/39cf223fb0f9ceeed893aab0773aba39 to your computer and use it in GitHub Desktop.
An inverted index like Elasticsearch for term filter lookups only
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns plank.core
  (:require [clojure.string])
  (:import (java.util.regex Pattern)))
; Problem Statement: Write code to solve this problem
; Given a set of news articles, find the articles that match a user's search query.
; Assume that the search query is a single word.
;
; Each article has this structure: Id,Headline,Content
; Articles cannot be updated
; Sort order of search results is not deterministic
;; Inverted index: term -> set of ids whose text contains that term
(def term-lookup
  "Inverse index for term to id"
  (atom {}))

;; Document store: id -> full document map
(def id-lookup
  "Id to full document"
  (atom {}))

;; Delimiter used when tokenizing text: one whitespace character
(def word-pattern (Pattern/compile "\\s"))
(defn find-terms
  "Split `text` into its terms, using `word-pattern` as the delimiter.

  Returns a vector of the non-blank tokens in order of appearance.
  Leading or consecutive delimiters would otherwise produce empty-string
  tokens, so those are dropped here. Returns an empty vector when `text`
  is nil."
  [text]
  (if (nil? text)
    []
    (filterv (complement clojure.string/blank?)
             (clojure.string/split text word-pattern))))
(defn not-empty-text?
  "Return true when `text` contains at least one non-whitespace character.

  Returns false for nil, the empty string and whitespace-only strings.
  `clojure.string/blank?` already treats whitespace-only input as blank and
  accepts nil, so no trimming is needed beforehand."
  [text]
  (not (clojure.string/blank? text)))
(defn insert-terms
  "Add every term found in `text` to the term lookup, associating it with `id`.

  Each non-blank term maps to a set of ids; `id` is added to the set of
  every term in `text`. Nothing is indexed when `id` or `text` is nil or
  blank. `id` is checked with `not-empty-text?`, so it is expected to be a
  string. Always returns nil."
  [id text]
  (when (and id
             text
             (not-empty-text? id)
             (not-empty-text? text))
    (let [words (filterv not-empty-text? (find-terms text))]
      (when (seq words)
        ;; One swap! for the whole text: all terms of the text become
        ;; visible together instead of one compare-and-set per word.
        (swap! term-lookup
               (fn [index]
                 (reduce (fn [acc word]
                           ;; fnil supplies an empty set for unseen terms
                           (update acc word (fnil conj #{}) id))
                         index
                         words))))
      nil)))
(defn insert-document
  "Store a document under its id in the id lookup so it can be fetched later.

  Only the :id, :headline and :content keys of the argument are kept.
  Prints a warning when an entry already exists for the id; the entry is
  then replaced. Returns the updated id lookup map."
  [{:keys [id headline content]}]
  (let [stored {:headline headline
                :id id
                :content content}]
    (when (get @id-lookup id)
      (println "Warning: Overwrite"))
    (swap! id-lookup assoc id stored)))
(defn retrieve-document
  "Look up the full document stored under `id`.
  Returns nil when nothing is stored for that id."
  [id]
  (get (deref id-lookup) id))
(defn index-document
  "Index a document: register the terms of its headline and of its content
  in the term lookup, then store the document itself in the id lookup.
  Returns whatever `insert-document` returns."
  [{:keys [id headline content] :as document}]
  ;; Headline first, then content, matching the order terms are registered.
  (doseq [text [headline content]]
    (insert-terms id text))
  (insert-document document))
(defn search
  "Given a word, return a vector of the full documents registered for that
  exact term in the term lookup. Returns nil when the word is not present
  in the lookup. The order of the results is not specified."
  [word]
  (let [matching-ids (get @term-lookup word)]
    (when matching-ids
      (mapv retrieve-document matching-ids))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment