Created
May 14, 2012 06:01
-
-
Save ato/2692065 to your computer and use it in GitHub Desktop.
PANDORA display instance to WARC converter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns clj-warc.pandora2warc | |
"PANDORA display instance to WARC converter." | |
(:use clojure.java.io) | |
(:require [clojure.string :as str] | |
[clojure.contrib.seq-utils :as seq-utils]) | |
(:import (org.archive.util Base32) | |
(java.nio.file Path Files SimpleFileVisitor FileVisitResult) | |
(java.nio.file.attribute FileTime BasicFileAttributes)) | |
(:gen-class)) | |
(def ^java.text.SimpleDateFormat date-fmt nil) | |
(defn new-date-fmt [] | |
(doto (java.text.SimpleDateFormat. "yyyy-MM-dd'T'HH:mm:ss'Z'") | |
(.setCalendar (java.util.Calendar/getInstance | |
(java.util.TimeZone/getTimeZone "UTC"))))) | |
(defn decode-mime-line | |
"Decode a line of a mime.types file. Returns a seq of [extension mime] | |
pairs." | |
[s] | |
(let [[mime & extensions] (-> s (str/replace #"#.*" "") ; remove comments | |
(str/split #"\s+"))] | |
(for [ext extensions] [ext mime]))) | |
(defn load-mimes | |
"Construct a hash map of file extension to mime-type." | |
[] | |
(let [lines (-> (clojure.lang.RT/baseLoader) | |
(.getResourceAsStream "mime.types") | |
(reader) | |
(line-seq))] | |
(into {} (mapcat decode-mime-line lines)))) | |
(defn read-panaccess | |
"Read a .panacess-mime.types file and return a map of filenames | |
to mime-types." | |
[f] | |
(let [f (file f)] | |
(when (.exists f) | |
(->> (reader f) | |
(line-seq) | |
(filter #(re-matches #".*(<Files|ForceType) .*" %)) | |
(partition 2) | |
(map (fn [[files ft]] | |
[(second (re-matches #"\s*<Files\s*\"([^\"]+)\">\s*" files)) | |
(second (re-matches #"\s*ForceType\s+(\S*)\s*" ft))])) | |
(into {}))))) | |
(let [cache (atom nil)] | |
(defn read-panaccess-cache | |
"Just like read-panaccess but remembers the last file we read. | |
Dumb cache for tired brain." | |
[f] | |
(let [[fc value] @cache] | |
(if (= fc f) | |
value | |
(second (reset! cache [f (read-panaccess f)])))))) | |
(def mime-types (load-mimes)) | |
(defn mime-for-file [^java.io.File f] | |
(let [ext (last (str/split (str f) #"\.")) | |
pamt (file (.getParent f) ".panaccess-mime.types") | |
pa-mime (get (read-panaccess-cache pamt) (.getName f))] | |
(if pa-mime | |
pa-mime | |
(get mime-types ext "application/octet-stream")))) | |
(defn symlink? [^java.io.File f] | |
(let [canon (if (.getParent f) | |
(file (-> f (.getParentFile) (.getCanonicalFile)) | |
(.getName f)) | |
file)] | |
(not= (.getCanonicalFile canon) (.getAbsoluteFile canon)))) | |
(defn safe-file-seq | |
"A tree seq on java.io.Files, doesn't branch on symbolic links." | |
[dir] | |
(tree-seq | |
(fn [^java.io.File f] (and (not (symlink? f))) (.isDirectory f)) | |
(fn [^java.io.File d] (seq (.listFiles d))) | |
dir)) | |
(defn b32sha1 [^java.security.MessageDigest md] | |
(str "sha1:" (Base32/encode (.digest md)))) | |
(defn sha1-file [f] | |
(let [md (java.security.MessageDigest/getInstance "SHA1") | |
buffer (byte-array (* 1024 1024))] | |
(with-open [is (java.io.FileInputStream. (file f))] | |
(while | |
(let [length (.read is buffer)] | |
(when (pos? length) | |
(.update md buffer 0 length) | |
true)))) | |
(b32sha1 md))) | |
(defn make-header [f #^java.util.Date cdate prefix-size] | |
(try | |
(let [f (file f) | |
path (subs (str f) prefix-size) | |
encoded-path (str/replace (java.net.URLEncoder/encode path) | |
#"%2F" "/")] | |
(str "WARC/1.0\r\n" | |
"WARC-Type: resource\r\n" | |
"WARC-Target-URI: http://pandora.nla.gov.au/pan/" encoded-path "\r\n" | |
"WARC-Record-ID: <urn:uuid:" (java.util.UUID/randomUUID) ">\r\n" | |
"WARC-Date: " (.format date-fmt cdate) "\r\n" | |
"WARC-Block-Digest: " (sha1-file f) "\r\n" | |
"Content-Type: " (mime-for-file f) "\r\n" | |
"Content-Length: " (.length f) "\r\n" | |
"\r\n")) | |
(catch Exception e | |
(println "XXX " cdate) | |
(throw e)))) | |
(defn filetime-to-date [^FileTime ft] | |
(java.util.Date. (.toMillis ft))) | |
(defn happy-find | |
"Walk directory tree returning a list of [file creation-date] pairs. Java 7 version." | |
[dir] | |
(let [path (.toPath (file dir))] | |
(seq-utils/fill-queue | |
(fn [fill] | |
(java.nio.file.Files/walkFileTree | |
path (proxy [java.nio.file.SimpleFileVisitor] [] | |
(visitFile [^Path file ^BasicFileAttributes attribs] | |
(fill [(.toFile file) (filetime-to-date (.creationTime attribs))]) | |
FileVisitResult/CONTINUE))))))) | |
(defn valid-file? [^java.io.File f] | |
(and (.exists f) | |
(not (.isDirectory f)) | |
(not (re-matches #"\.panaccess.*" (.getName f))))) | |
(defn ^java.io.FileOutputStream renaming-output [file] | |
(let [tmp-file (java.io.File. (str file ".new")) | |
fos (proxy [java.io.FileOutputStream] [tmp-file] | |
(close [] (proxy-super close) | |
(.renameTo tmp-file file)))] | |
(.deleteOnExit tmp-file) | |
fos)) | |
(defn write-record [path ctime prefix-size warc] | |
(copy (make-header path ctime prefix-size) warc) | |
(with-open [is (java.io.FileInputStream. ^java.io.File path)] | |
(copy is warc)) | |
(copy "\r\n\r\n" warc)) | |
(defn convert-instance [^java.io.File dest ^java.io.File dir] | |
(binding [date-fmt (new-date-fmt)] | |
(let [pi (-> dir (.getParentFile) (.getName)) | |
timestamp (.getName dir) | |
prefix-size (inc (count (.getParent (.getParentFile dir)))) | |
outfile (file dest (str "nla.arc-" pi "-" timestamp ".warc"))] | |
(with-open [warc (renaming-output outfile)] | |
(doseq [[path ctime] (happy-find dir) | |
:when (valid-file? path)] | |
(write-record path ctime prefix-size warc)))))) | |
(defn -main [dest & args] | |
(dorun (pmap #(convert-instance (file dest) (file %)) args)) | |
(shutdown-agents)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment