Created
August 4, 2014 14:26
-
-
Save rcarmo/a8904e51779302431c54 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(import [os [walk environ]] | |
[os.path [dirname exists join getmtime]] | |
[datetime [datetime]] | |
[time [time]] | |
[whoosh.writing [BufferedWriter AsyncWriter]] | |
[whoosh.index [create-in open-dir]] | |
[whoosh.fields [Schema TEXT STORED ID KEYWORD NUMERIC DATETIME]] | |
[xml.etree [cElementTree]] | |
[bs4 [BeautifulSoup]] | |
[hashlib [sha1]] | |
[PIL [Image]] | |
[StringIO [StringIO]] | |
[functools [partial]]) | |
(defmacro timeit [block]
  ; Expands into code that evaluates `block` once and then prints the elapsed
  ; time, i.e. the difference between two (time) readings, to stdout.
  ; `time` must be bound at the expansion site (this file imports time.time).
  ; The form evaluates to the result of print; the value of `block` is dropped.
  ;
  ; The start time is held in a gensym'd local so the expansion can never
  ; shadow a variable (for instance one called `t`) that `block` refers to.
  (with-gensyms [started]
    `(let [[~started (time)]]
       ~block
       (print (- (time) ~started)))))
; name of the metadata file that marks a directory as worth indexing
(def metadata-file "metadata.opf")
; size handed to PIL's thumbnail call when shrinking covers
(def thumbnail-size (, 128 128))
(defn create-index [path]
  ; Builds the Whoosh schema used by this script and creates a new index for
  ; it in the directory `path`; returns whatever create-in returns.
  ;
  ; Field layout:
  ;   id                     ID, stored, unique
  ;   path, description      TEXT, stored
  ;   mtime                  DATETIME, stored, sortable
  ;   title, creator,
  ;   calibre:series,
  ;   calibre:series_index   TEXT, stored, sortable
  ;   cover                  STORED only
  (let [[stored-only {"stored" true}]
        [stored-sortable {"stored" true "sortable" true}]
        [layout {"id" (apply ID [] {"stored" true "unique" true})
                 "path" (apply TEXT [] stored-only)
                 "mtime" (apply DATETIME [] stored-sortable)
                 "title" (apply TEXT [] stored-sortable)
                 "creator" (apply TEXT [] stored-sortable)
                 "calibre:series" (apply TEXT [] stored-sortable)
                 "calibre:series_index" (apply TEXT [] stored-sortable)
                 "description" (apply TEXT [] stored-only)
                 "cover" STORED}]]
    ; the field names are passed as keyword arguments, which is why names
    ; containing ":" are spelled as dictionary keys
    (create-in path (apply Schema [] layout))))
(defn open-index [path]
  ; Thin wrapper: opens the index stored in directory `path` via Whoosh's
  ; open-dir and returns the result unchanged.
  (let [[index (open-dir path)]]
    index))
(defn gen-metadata [path]
  ; Generator: walks the directory tree rooted at `path` and yields each
  ; directory whose file listing contains metadata-file.
  (for [entry (walk path)]
    ; entry is the (dirpath, dirnames, filenames) triple produced by os.walk
    (when (in metadata-file (get entry 2))
      (yield (get entry 0)))))
(defn plaintext [buffer]
  ; Parses `buffer` with BeautifulSoup and returns only its text content,
  ; i.e. the markup flattened to plain text.
  (let [[soup (BeautifulSoup buffer)]]
    (.get-text soup)))
(defn extract-field [element field]
  ; Looks up the first element named `field` in the Dublin Core namespace
  ; anywhere below `element` and returns its text run through plaintext.
  ; Only the first chunk produced by itertext is used.
  ; Any failure along the way (no match, no text, ...) yields nil.
  (try
    (let [[query (.format ".//{0}{1}" "{http://purl.org/dc/elements/1.1/}" field)]
          [node (get (.findall element query) 0)]
          [leading-text (.next (.itertext node))]]
      (plaintext leading-text))
    (catch [e Exception]
      nil)))
(defn extract-meta [element field]
  ; Returns the "content" attribute of the first <meta name="FIELD" .../>
  ; element found anywhere below `element`.
  ; Returns nil when there is no such element, when it has no "content"
  ; attribute, or when the lookup fails for any other reason.
  (try
    (let [[opf-ns "{http://www.idpf.org/2007/opf}"]
          ; the field name has to be substituted into the predicate
          [predicate (.format "meta[@name='{0}']" field)]
          ; NOTE(review): OPF files normally declare the OPF namespace as the
          ; default one, which puts <meta> inside it; the qualified tag is
          ; tried first and the bare tag kept as a fallback. Confirm against
          ; real metadata.opf files.
          [hits (or (.findall element (+ ".//" opf-ns predicate))
                    (.findall element (+ ".//" predicate)))]]
      ; Element.get reads an attribute, whereas Hy's (get x key) would
      ; subscript the element, which only accepts integer indices
      (.get (get hits 0) "content"))
    (catch [e Exception]
      nil)))
(defn get-cover [path]
  ; Produces a thumbnail of the "cover.jpg" file inside directory `path`.
  ; Returns the JPEG-encoded thumbnail as the contents of an in-memory
  ; StringIO buffer, or nil when there is no cover file.
  (let [[source (join path "cover.jpg")]]
    (when (exists source)
      (let [[picture (.open Image source)]
            [sink (StringIO)]
            ; NOTE(review): the explicit (int 75) presumably works around Hy
            ; reading integer literals as long on Python 2 - confirm before
            ; simplifying
            [jpeg-options {"quality" (int 75)
                           "optimize" true
                           "progressive" true}]]
        ; thumbnail resizes the image object in place
        (.thumbnail picture thumbnail-size Image.ANTIALIAS)
        (apply .save [picture sink "JPEG"] jpeg-options)
        (.getvalue sink)))))
(defn parse-one [path]
  ; Parses the metadata file inside directory `path` and returns a dict with
  ; the fields to index:
  ;   title, creator, description           via extract-field
  ;   calibre:series, calibre:series_index  via extract-meta
  ;   path   the directory itself
  ;   id     hex SHA-1 digest of `path`, as unicode
  ;   mtime  modification time of the metadata file, as a datetime
  ;   cover  result of get-cover for the directory
  ; Fields that cannot be extracted are present with a nil value.
  (let [[opf (join path metadata-file)]
        ; cElementTree gets the file name rather than an open file object, so
        ; it opens and closes the file itself and no handle is left dangling
        [doc (.getroot (.parse cElementTree opf))]
        [fields ["title" "creator" "description"]]
        [values (map (partial extract-field doc) fields)]
        [meta-fields ["calibre:series" "calibre:series_index"]]
        [meta-values (map (partial extract-meta doc) meta-fields)]]
    ; relies on map returning a list (Python 2)
    (.extend fields meta-fields)
    (.extend values meta-values)
    (let [[data (dict (zip fields values))]]
      (assoc data "path" path
                  "id" (unicode (.hexdigest (sha1 path)))
                  "mtime" (datetime.fromtimestamp (getmtime opf))
                  "cover" (get-cover path))
      data)))
(defn add-one [writer fields]
  ; Adds a single document to `writer`: the entries of the dict `fields` are
  ; passed to its add_document method as keyword arguments.
  (apply writer.add_document [] fields))
; TODO: update existing documents in place instead of rebuilding the index on every run,
;       see https://pythonhosted.org/Whoosh/indexing.html#updating-documents
; Script entry point: (re)creates the index in ./data, then indexes every
; directory under $HOME/Dropbox/Calibre that contains a metadata file and
; prints how long the indexing pass took.
(import [os [mkdir]])

; create-in expects the target directory to exist already, so make sure it
; does before the first run
(when (not (exists "data"))
  (mkdir "data"))
(create-index "data")
(timeit
  (let [[index (open-index "data")]
        [writer (apply BufferedWriter [index] {"period" 120 "limit" 20})]]
    (try
      (for [f (gen-metadata (join (get environ "HOME") "Dropbox/Calibre"))]
        (print f)
        (add-one writer (parse-one f)))
      (finally
        ; close the writer even when parsing or indexing fails part-way, so
        ; it is never left open
        (.close writer)))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment