Skip to content

Instantly share code, notes, and snippets.

@rcarmo
Created August 4, 2014 14:26
Show Gist options
  • Save rcarmo/a8904e51779302431c54 to your computer and use it in GitHub Desktop.
Save rcarmo/a8904e51779302431c54 to your computer and use it in GitHub Desktop.
(import [os [walk environ]]
[os.path [dirname exists join getmtime]]
[datetime [datetime]]
[time [time]]
[whoosh.writing [BufferedWriter AsyncWriter]]
[whoosh.index [create-in open-dir]]
[whoosh.fields [Schema TEXT STORED ID KEYWORD NUMERIC DATETIME]]
[xml.etree [cElementTree]]
[bs4 [BeautifulSoup]]
[hashlib [sha1]]
[PIL [Image]]
[StringIO [StringIO]]
[functools [partial]])
(defmacro timeit [block]
  ; Expands to code that evaluates `block` and then prints the elapsed
  ; wall-clock time in seconds, measured with `time` (which must be in
  ; scope at the expansion site).
  ;
  ; The start time is kept in a gensym'd variable rather than a literal
  ; `t`, so a `t` referenced inside `block` still means the caller's `t`
  ; instead of being shadowed by the macro's own binding.
  (let [[started (gensym "started")]]
    `(let [[~started (time)]]
       ~block
       (print (- (time) ~started)))))
; name of the metadata document looked for in every directory by gen-metadata
; and read by parse-one
(setv metadata-file "metadata.opf")
; size tuple handed to the image's thumbnail call in get-cover
(setv thumbnail-size (, 128 128))
(defn create-index [path]
  ; Creates a new Whoosh index in the directory `path` and returns the
  ; value of create-in.
  ;
  ; create-in expects the directory to exist already, so it is created
  ; here when missing; otherwise a first run against a fresh checkout
  ; fails before anything is indexed.
  ;
  ; Schema: every field is stored; "id" is additionally unique, and
  ; mtime/title/creator/series/series_index are declared sortable.
  ; "cover" is STORED only (kept with the document, not searchable).
  (import [os [makedirs]])
  (if (not (exists path))
    (makedirs path))
  (let [[schema (apply Schema []
                  {"id"                   (apply ID [] {"stored" true "unique" true})
                   "path"                 (apply TEXT [] {"stored" true})
                   "mtime"                (apply DATETIME [] {"stored" true "sortable" true})
                   "title"                (apply TEXT [] {"stored" true "sortable" true})
                   "creator"              (apply TEXT [] {"stored" true "sortable" true})
                   "calibre:series"       (apply TEXT [] {"stored" true "sortable" true})
                   "calibre:series_index" (apply TEXT [] {"stored" true "sortable" true})
                   "description"          (apply TEXT [] {"stored" true})
                   "cover"                STORED})]]
    (create-in path schema)))
(defn open-index [path]
; opens the index stored in the directory `path` by delegating to
; whoosh.index's open-dir, and returns whatever open-dir returns
(open-dir path))
(defn gen-metadata [path]
  ; Generator: walks the tree under `path` and yields each directory
  ; whose file list contains an entry named exactly like metadata-file.
  ; Each walk entry is (dirpath, dirnames, filenames): index 2 is the
  ; list of file names, index 0 the directory that holds them.
  (for [entry (walk path)]
    (when (in metadata-file (get entry 2))
      (yield (get entry 0)))))
(defn plaintext [buffer]
  ; Parses `buffer` with BeautifulSoup and returns the text content of
  ; the resulting document, i.e. the markup with the tags dropped.
  (let [[soup (BeautifulSoup buffer)]]
    (.get-text soup)))
(defn extract-field [element field]
  ; Looks up the first descendant of `element` named `field` in the
  ; Dublin Core namespace, takes the first text chunk it yields, and
  ; returns that chunk flattened through plaintext.
  ; Any failure along the way (no such element, no text, ...) is
  ; swallowed and reported as nil.
  (try
    (let [[query (.format ".//{0}{1}" "{http://purl.org/dc/elements/1.1/}" field)]
          [node (get (.findall element query) 0)]
          [chunk (.next (.itertext node))]]
      (plaintext chunk))
    (catch [e Exception]
      nil)))
(defn extract-meta [element field]
  ; Returns the "content" attribute of the first <meta name=FIELD .../>
  ; element found below `element`, or nil when there is no such element
  ; (or it has no "content" attribute).
  ;
  ; Two defects of the previous version are fixed here; both were hidden
  ; by a catch-all handler, so the function always returned nil:
  ;   * the XPath template was formatted without `field`;
  ;   * the attribute was read with `get` (item access, which on an
  ;     Element indexes child elements) instead of the Element's own
  ;     .get attribute lookup.
  ;
  ; <meta> is searched both without a namespace and in the OPF namespace;
  ; un-namespaced matches come first, so documents that matched before
  ; still resolve to the same element.
  ; NOTE(review): assumes `field` contains no single quote, since it is
  ; spliced into the XPath predicate - true for the callers in this file.
  (let [[opf "{http://www.idpf.org/2007/opf}"]
        [matches (+ (.findall element (.format ".//meta[@name='{0}']" field))
                    (.findall element (.format ".//{0}meta[@name='{1}']" opf field)))]]
    (if matches
      (.get (get matches 0) "content")
      nil)))
(defn get-cover [path]
  ; Builds a reduced JPEG rendition of <path>/cover.jpg and returns the
  ; encoded data taken from an in-memory buffer.
  ; Returns nil when the directory has no cover.jpg.
  (let [[source (join path "cover.jpg")]]
    (when (exists source)
      (let [[picture (Image.open source)]
            [sink (StringIO)]]
        ; scale down to thumbnail-size, then encode into the buffer
        (picture.thumbnail thumbnail-size Image.ANTIALIAS)
        (apply picture.save [sink "JPEG"] {"quality" (int 75)
                                           "optimize" true
                                           "progressive" true})
        (.getvalue sink)))))
(defn parse-one [path]
  ; Parses the metadata file inside the directory `path` and returns a
  ; dict with the fields to index:
  ;   title / creator / description           - via extract-field
  ;   calibre:series / calibre:series_index   - via extract-meta
  ;   path   - the directory itself
  ;   id     - hex SHA-1 digest of `path`, as unicode
  ;   mtime  - modification time of the metadata file, as a datetime
  ;   cover  - thumbnail data from get-cover (nil without a cover)
  ; Fields that cannot be extracted are present with a nil value.
  (let [[opf-path (join path metadata-file)]
        ; Pass the file *name* to the parser so it opens and closes the
        ; file itself. Opening it here and handing over the file object
        ; left one descriptor open per book until garbage collection.
        [doc (.getroot (.parse cElementTree opf-path))]
        [fields ["title" "creator" "description"]]
        [values (map (partial extract-field doc) fields)]
        [meta-fields ["calibre:series" "calibre:series_index"]]
        [meta-values (map (partial extract-meta doc) meta-fields)]]
    ; both lists are fresh per call, so extending them in place is safe
    (.extend fields meta-fields)
    (.extend values meta-values)
    (let [[data (dict (zip fields values))]]
      (assoc data "path" path
                  "id" (unicode (.hexdigest (sha1 path)))
                  "mtime" (datetime.fromtimestamp (getmtime opf-path))
                  "cover" (get-cover path))
      ; assoc is evaluated for its side effect; hand back the dict itself
      data)))
(defn add-one [writer fields]
; adds one document to `writer`: calls the writer's add_document method
; with the entries of the `fields` mapping expanded as keyword arguments
; (field name -> value)
(apply .add_document [writer] fields))
; TODO: https://pythonhosted.org/Whoosh/indexing.html#updating-documents
; (Re)create the index in ./data, then index every directory below
; $HOME/Dropbox/Calibre that holds a metadata file, printing each
; directory as it is processed and the total elapsed time at the end.
(create-index "data")
(timeit
  (let [[index (open-index "data")]
        [writer (apply BufferedWriter [index] {"period" 120 "limit" 20})]]
    ; Close the writer even when parsing or indexing one of the books
    ; raises: otherwise the buffered writer is abandoned without being
    ; closed and documents it still holds are never committed.
    (try
      (for [f (gen-metadata (join (get environ "HOME") "Dropbox/Calibre"))]
        (print f)
        (add-one writer (parse-one f)))
      (finally
        (.close writer)))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment