jackrusher · February 21, 2018 10:34
diff --git a/marc21.clj b/marc21.clj
 (ns marc21-reader
  (:require [nio.core :as nio]       ; only for mmap
            [clojure.string :as string]
            [hickory.core :as hc]
            [hickory.select :as hs]))

 ;; Charset name that parse-shape passes to the String constructor when it
 ;; decodes raw bytes. parse-record rebinds it for the duration of one record
 ;; when the leader's :encoding entry is non-nil.
 (def ^:dynamic *current-encoding* "ISO-8859-1") ; changes to UTF-8 as needed

 ;; I'm old, tired and lazy, so I'd
 ;; spider the format description
 ;; files and extract the data into
 ;; local lookup tables to allow me
 ;; to interactively query the
 ;; records with human readable
 ;; keywords.

 ;; https://www.loc.gov/marc/bibliographic/bdsummary.html
 (def field-names
  ;; Lookup table built once, at load time, from a saved copy of the field
  ;; summary page. Keys are the text of the first width-attributed cell of each
  ;; row (presumably the 3-character tag -- parse-record looks tags up here);
  ;; values are keywords derived from the text of the second cell.
  (let [->keyword (fn [label]
                    ;; trim, lower-case, then collapse every run of
                    ;; non-letters into a single "-"
                    (keyword (string/replace (string/lower-case (string/trim label))
                                             #"[^a-z]+"
                                             "-")))
        rows      (->> (slurp "/Users/jack/tmp/for-paul/marc-field-types.html")
                       hc/parse
                       hc/as-hickory
                       (hs/select (hs/child (hs/class "summarytable")))
                       (map (fn [table]
                              (map (comp first :content)
                                   (hs/select (hs/attr :width) table)))))]
    (into {}
          (comp
           ;; rows whose second cell starts with a nested element (a map), not text
           (remove (comp map? second))
           (map (fn [[tag meaning]] [tag (->keyword meaning)])))
          rows)))

 (defn extract-subfield-names
  "Builds a map of subfield code keyword -> descriptive keyword from a saved
  copy of a field description page.

  filename is anything slurp accepts. The page is parsed with hickory, and the
  non-element children of the matching li/td elements are read as text of the
  form: dollar sign, one-character code, hyphen, description.

  Entries without a hyphen separator, or too short to hold a code, are skipped
  instead of throwing."
  [filename]
  (->> (slurp filename)
       hc/parse
       hc/as-hickory
       (hs/select (hs/child (hs/find-in-text #"^\$")))
       (filter #(#{:li :td} (:tag %)))
       (mapcat :content)
       (remove map?)
       (reduce (fn [m e]
                 ;; Split on the FIRST hyphen only: a description that itself
                 ;; contains a hyphen must stay whole rather than be cut off.
                 (let [[tag meaning] (string/split e #"-" 2)]
                   (if (and meaning (>= (count tag) 2))
                     ;; character 0 of tag is the "$"; character 1 is the code
                     (assoc m (keyword (subs tag 1 2))
                            (-> (string/trim meaning)
                                string/lower-case
                                ;; drop a trailing parenthetical
                                (string/replace #" \(.*\)$" "")
                                (string/replace #"[^a-z]+" "-")
                                keyword))
                     m)))
               {})))

 ;; N.B. We'd need to read in tables for each of the field-names
 ;; above for this to be a general solution.
 (def subfield-names
  ;; tag string -> table returned by extract-subfield-names for that tag's page.
  ;; Only the two tags listed here get subfield tables; parse-data falls
  ;; back to the tag itself for everything else.
  (into {}
        (map (fn [[tag page]] [tag (extract-subfield-names page)]))
        [;; http://www.loc.gov/marc/bibliographic/bdx00.html
         ["100" "/Users/jack/tmp/for-paul/personal-name-subfields.html"]
         ;; https://www.loc.gov/marc/bibliographic/bd245.html
         ["245" "/Users/jack/tmp/for-paul/title-statement.html"]]))

 (defn parse-shape
  "Reads consecutive fixed-width chunks from incoming and returns them as a map.

  shape is a sequence of [field len parse-fn] entries (parse-fn optional).
  For each entry, len bytes are pulled from incoming via its .get method,
  which advances the read position as a side effect. The bytes are decoded
  with *current-encoding*, passed through parse-fn when one is given, and
  stored under field. An entry whose field is nil is read and discarded."
  [shape incoming]
  (letfn [(read-chunk [acc [field len parse-fn]]
            (let [chunk (byte-array len)]
              ;; always consume the bytes, even when the value is not kept
              (.get incoming chunk)
              (if-not field
                acc
                (let [text (String. chunk *current-encoding*)]
                  (assoc acc field (if parse-fn (parse-fn text) text))))))]
    (reduce read-chunk {} shape)))

 ;; Layout of the record leader as consumed by parse-shape: one
 ;; [key byte-width optional-parser] entry per element, 24 bytes in total.
 ;; Each parser receives the decoded chunk as a String.
 (def leader-shape
  [[:length              5 #(Integer/parseInt %)]
   [:status              1]
   [:record-type         1]
   [:biblio-level        1]
   [:type-of-control     1]
   ;; The parser is handed a String, never a char, so it must be compared with
   ;; "a"; (= "a" \a) is always false and UTF-8 would never be selected.
   [:encoding            1 #(when (= % "a") "UTF-8")] ; a=unicode, nil=default
   [:indicator-count     1 #(Integer/parseInt %)]
   [:subfield-code-count 1 #(Integer/parseInt %)]
   [:base-address        5 #(Integer/parseInt %)]
   [:encoding-level      1]
   [:descriptive-form    1]
   [:multipart-level     1]
   [:length-of-length    1 #(Integer/parseInt %)]
   [:length-of-start     1 #(Integer/parseInt %)]
   [:junk-in-the-trunk   1] ; implementation defined
   [:undefined           1]])

 ;; Layout of one directory entry for parse-shape: a 3-byte tag kept as text,
 ;; then a 4-byte field length and a 5-byte start position, both parsed as
 ;; integers (12 bytes in total).
 (def dir-entry-shape
  (let [as-int (fn [digits] (Integer/parseInt digits))]
    [[:tag          3]
     [:field-length 4 as-int]
     [:start-pos    5 as-int]]))

 (defn control-field?
  "True when tag parses as an integer below 10 (tags 000-009). parse-record
  stores such fields verbatim instead of splitting them into subfields.

  A tag that is not numeric (or is nil) is reported as a non-control field
  rather than throwing NumberFormatException and aborting the whole record."
  [tag]
  (try
    (< (Integer/parseInt tag) 10)
    (catch NumberFormatException _
      false)))

 (defn parse-data
  "Splits one variable data field into its indicator and subfields.

  raw is a map holding the decoded field text under :raw-field. Character 0
  is skipped, characters 1-2 become :indicator, and the remainder is split on
  character 31. For each non-empty piece, the first character is the subfield
  code and the rest is its value.

  Subfield codes are translated through subfield-names for this tag. A code
  with no translation is stored under tag itself, so several untranslated
  subfields in one field overwrite each other (the last one wins)."
  [tag raw]
  (let [text      (:raw-field raw)
        indicator (subs text 1 3)
        delimiter (re-pattern (str (char 31)))
        known     (get subfield-names tag)
        pieces    (remove empty? (clojure.string/split (subs text 3) delimiter))]
    {:indicator indicator
     :subfields (into {}
                      (map (fn [piece]
                             [(or (get known (keyword (subs piece 0 1))) tag)
                              (subs piece 1)]))
                      pieces)}))

 (defn parse-record
  "Reads one complete record from incoming and returns a map of field name
  (looked up in field-names by tag) to parsed field content.

  Steps, all consuming incoming sequentially:
  1. the leader, via leader-shape;
  2. (base-address - 25) / 12 directory entries, via dir-entry-shape
     (12 is the width of one entry; 25 is presumably the 24 leader bytes plus
     one terminator byte -- TODO confirm);
  3. one chunk of :field-length bytes per directory entry, in directory
     order; :start-pos is not used;
  4. two trailing bytes, which are discarded.

  *current-encoding* is rebound for steps 2-4 when the leader supplies an
  encoding. Fields whose tags map to the same name (including tags missing
  from field-names, which map to nil) overwrite one another."
  [incoming]
  (let [leader          (parse-shape leader-shape incoming) ;; for record length and encoding, really
        dir-entry-count (/ (- (:base-address leader) 25) 12)
        encoding        (or (:encoding leader) *current-encoding*)]
    (binding [*current-encoding* encoding]
      (let [;; realise the whole directory before touching any field bytes
            directory  (vec (repeatedly dir-entry-count
                                        #(parse-shape dir-entry-shape incoming)))
            read-field (fn [acc {:keys [tag field-length]}]
                         (let [control?  (control-field? tag)
                               field-key (field-names tag)]
                           (assoc acc field-key
                                  (if control?
                                    (parse-shape [[:control-fields field-length]] incoming)
                                    (parse-data tag
                                                (parse-shape [[:raw-field field-length]] incoming))))))
            fields     (reduce read-field {} directory)]
        (parse-shape [[:tail 2]] incoming) ;; eat the last two bytes
        fields))))

 (def example-records
  ;; Maps the sample file with nio/mmap and eagerly parses 270 records from
  ;; it, one after another, into a vector.
  (let [buffer (nio/mmap "/Users/jack/tmp/for-paul/PGA-Australiana.mrc")]
    (vec (repeatedly 270 #(parse-record buffer)))))

 ;; REPL check: inspect the first parsed record. The commented map that
 ;; follows is the output the author recorded.
 (first example-records)
 ;; {:fixed-length-data-elements {:control-fields "070529s9999    xx            000 0 und d"},
 ;;  :main-entry-personal-name
 ;;  {:indicator "1 ", :subfields {:personal-name "Amundsen, Roald,", :dates-associated-with-a-name "1872-1928."}},
 ;;  :title-statement {:indicator "14", :subfields {:title "The South Pole", :medium "[electronic resource]."}},
 ;;  :general-note {:indicator "  ", :subfields {"500" "An ebook provided by Project Gutenberg Australia."}},
 ;;  :electronic-location-and-access {:indicator "40", :subfields {"856" "http://gutenberg.net.au/ebooks/e00111.txt "}}}

 ;; Let's get some by-author counts
 (let [author-of #(some-> % :main-entry-personal-name :subfields :personal-name)]
  ;; count records per personal name (records without one are dropped),
  ;; highest count first
  (sort-by val > (frequencies (keep author-of example-records))))
 ;;=>
 ;; ["Lawson, Henry," 15]
 ;; ["Paterson, A. B." 11]
 ;; ["Dyson, Edward," 9]
 ;; ["Dennis, C. J." 9]
 ;; ["Richardson, Henry Handel," 8]
 ;; ["Clarke, Marcus," 7]
 ;; ["Becke, Louis," 6]
 ;; ["Praed, Campbell," 6]
 ;; ["Scott, Ernest," 5]
 ;; ["Gaunt, Mary," 5]

 ;; how about titles of works by Mary Gaunt?
 (for [record example-records
      :when (= "Gaunt, Mary,"
               (some-> record :main-entry-personal-name :subfields :personal-name))]
  ;; title subfield of each matching record, in file order
  (-> record :title-statement :subfields :title))
 ;; => ("The Moving Finger" "Kirkham's Find" "End of the Earth" "The Doctor's Drive" "Dave's Sweetheart")
	(ns marc21-reader
	(:require [nio.core :as nio] ; only for mmap
	[clojure.string :as string]
	[hickory.core :as hc]
	[hickory.select :as hs]))

	;; Charset name that parse-shape passes to the String constructor when it
	;; decodes raw bytes. parse-record rebinds it for the duration of one record
	;; when the leader's :encoding entry is non-nil.
	;; NOTE(review): declared without earmuffs in this copy; parse-shape and
	;; parse-record below refer to it by this exact name.
	(def ^:dynamic current-encoding "ISO-8859-1") ; changes to UTF-8 as needed

	;; I'm old, tired and lazy, so I'd
	;; spider the format description
	;; files and extract the data into
	;; local lookup tables to allow me
	;; to interactively query the
	;; records with human readable
	;; keywords.

	;; https://www.loc.gov/marc/bibliographic/bdsummary.html
	(def field-names
	  ;; Lookup table built once, at load time, from a saved copy of the field
	  ;; summary page. Keys are the text of the first width-attributed cell of each
	  ;; row (presumably the 3-character tag -- parse-record looks tags up here);
	  ;; values are keywords derived from the text of the second cell.
	  (let [->keyword (fn [label]
	                    ;; trim, lower-case, then collapse every run of
	                    ;; non-letters into a single "-"
	                    (keyword (string/replace (string/lower-case (string/trim label))
	                                             #"[^a-z]+"
	                                             "-")))
	        rows      (->> (slurp "/Users/jack/tmp/for-paul/marc-field-types.html")
	                       hc/parse
	                       hc/as-hickory
	                       (hs/select (hs/child (hs/class "summarytable")))
	                       (map (fn [table]
	                              (map (comp first :content)
	                                   (hs/select (hs/attr :width) table)))))]
	    (into {}
	          (comp
	           ;; rows whose second cell starts with a nested element (a map), not text
	           (remove (comp map? second))
	           (map (fn [[tag meaning]] [tag (->keyword meaning)])))
	          rows)))

	(defn extract-subfield-names
	  "Builds a map of subfield code keyword -> descriptive keyword from a saved
	  copy of a field description page.

	  filename is anything slurp accepts. The page is parsed with hickory, and the
	  non-element children of the matching li/td elements are read as text of the
	  form: dollar sign, one-character code, hyphen, description.

	  Entries without a hyphen separator, or too short to hold a code, are skipped
	  instead of throwing."
	  [filename]
	  (->> (slurp filename)
	       hc/parse
	       hc/as-hickory
	       (hs/select (hs/child (hs/find-in-text #"^\$")))
	       (filter #(#{:li :td} (:tag %)))
	       (mapcat :content)
	       (remove map?)
	       (reduce (fn [m e]
	                 ;; Split on the FIRST hyphen only: a description that itself
	                 ;; contains a hyphen must stay whole rather than be cut off.
	                 (let [[tag meaning] (string/split e #"-" 2)]
	                   (if (and meaning (>= (count tag) 2))
	                     ;; character 0 of tag is the "$"; character 1 is the code
	                     (assoc m (keyword (subs tag 1 2))
	                            (-> (string/trim meaning)
	                                string/lower-case
	                                ;; drop a trailing parenthetical
	                                (string/replace #" \(.*\)$" "")
	                                (string/replace #"[^a-z]+" "-")
	                                keyword))
	                     m)))
	               {})))

	;; N.B. We'd need to read in tables for each of the field-names
	;; above for this to be a general solution.
	(def subfield-names
	  ;; tag string -> table returned by extract-subfield-names for that tag's page.
	  ;; Only the two tags listed here get subfield tables; parse-data falls
	  ;; back to the tag itself for everything else.
	  (into {}
	        (map (fn [[tag page]] [tag (extract-subfield-names page)]))
	        [;; http://www.loc.gov/marc/bibliographic/bdx00.html
	         ["100" "/Users/jack/tmp/for-paul/personal-name-subfields.html"]
	         ;; https://www.loc.gov/marc/bibliographic/bd245.html
	         ["245" "/Users/jack/tmp/for-paul/title-statement.html"]]))

	(defn parse-shape
	  "Reads consecutive fixed-width chunks from incoming and returns them as a map.

	  shape is a sequence of [field len parse-fn] entries (parse-fn optional).
	  For each entry, len bytes are pulled from incoming via its .get method,
	  which advances the read position as a side effect. The bytes are decoded
	  with current-encoding, passed through parse-fn when one is given, and
	  stored under field. An entry whose field is nil is read and discarded."
	  [shape incoming]
	  (letfn [(read-chunk [acc [field len parse-fn]]
	            (let [chunk (byte-array len)]
	              ;; always consume the bytes, even when the value is not kept
	              (.get incoming chunk)
	              (if-not field
	                acc
	                (let [text (String. chunk current-encoding)]
	                  (assoc acc field (if parse-fn (parse-fn text) text))))))]
	    (reduce read-chunk {} shape)))

	;; Layout of the record leader as consumed by parse-shape: one
	;; [key byte-width optional-parser] entry per element, 24 bytes in total.
	;; Each parser receives the decoded chunk as a String.
	(def leader-shape
	  [[:length              5 #(Integer/parseInt %)]
	   [:status              1]
	   [:record-type         1]
	   [:biblio-level        1]
	   [:type-of-control     1]
	   ;; The parser is handed a String, never a char, so it must be compared with
	   ;; "a"; (= "a" \a) is always false and UTF-8 would never be selected.
	   [:encoding            1 #(when (= % "a") "UTF-8")] ; a=unicode, nil=default
	   [:indicator-count     1 #(Integer/parseInt %)]
	   [:subfield-code-count 1 #(Integer/parseInt %)]
	   [:base-address        5 #(Integer/parseInt %)]
	   [:encoding-level      1]
	   [:descriptive-form    1]
	   [:multipart-level     1]
	   [:length-of-length    1 #(Integer/parseInt %)]
	   [:length-of-start     1 #(Integer/parseInt %)]
	   [:junk-in-the-trunk   1] ; implementation defined
	   [:undefined           1]])

	;; Layout of one directory entry for parse-shape: a 3-byte tag kept as text,
	;; then a 4-byte field length and a 5-byte start position, both parsed as
	;; integers (12 bytes in total).
	(def dir-entry-shape
	  (let [as-int (fn [digits] (Integer/parseInt digits))]
	    [[:tag          3]
	     [:field-length 4 as-int]
	     [:start-pos    5 as-int]]))

	(defn control-field?
	  "True when tag parses as an integer below 10 (tags 000-009). parse-record
	  stores such fields verbatim instead of splitting them into subfields.

	  A tag that is not numeric (or is nil) is reported as a non-control field
	  rather than throwing NumberFormatException and aborting the whole record."
	  [tag]
	  (try
	    (< (Integer/parseInt tag) 10)
	    (catch NumberFormatException _
	      false)))

	(defn parse-data
	  "Splits one variable data field into its indicator and subfields.

	  raw is a map holding the decoded field text under :raw-field. Character 0
	  is skipped, characters 1-2 become :indicator, and the remainder is split on
	  character 31. For each non-empty piece, the first character is the subfield
	  code and the rest is its value.

	  Subfield codes are translated through subfield-names for this tag. A code
	  with no translation is stored under tag itself, so several untranslated
	  subfields in one field overwrite each other (the last one wins)."
	  [tag raw]
	  (let [text      (:raw-field raw)
	        indicator (subs text 1 3)
	        delimiter (re-pattern (str (char 31)))
	        known     (get subfield-names tag)
	        pieces    (remove empty? (clojure.string/split (subs text 3) delimiter))]
	    {:indicator indicator
	     :subfields (into {}
	                      (map (fn [piece]
	                             [(or (get known (keyword (subs piece 0 1))) tag)
	                              (subs piece 1)]))
	                      pieces)}))

	(defn parse-record
	  "Reads one complete record from incoming and returns a map of field name
	  (looked up in field-names by tag) to parsed field content.

	  Steps, all consuming incoming sequentially:
	  1. the leader, via leader-shape;
	  2. (base-address - 25) / 12 directory entries, via dir-entry-shape
	     (12 is the width of one entry; 25 is presumably the 24 leader bytes plus
	     one terminator byte -- TODO confirm);
	  3. one chunk of :field-length bytes per directory entry, in directory
	     order; :start-pos is not used;
	  4. two trailing bytes, which are discarded.

	  current-encoding is rebound for steps 2-4 when the leader supplies an
	  encoding. Fields whose tags map to the same name (including tags missing
	  from field-names, which map to nil) overwrite one another."
	  [incoming]
	  (let [leader          (parse-shape leader-shape incoming) ;; for record length and encoding, really
	        dir-entry-count (/ (- (:base-address leader) 25) 12)
	        encoding        (or (:encoding leader) current-encoding)]
	    (binding [current-encoding encoding]
	      (let [;; realise the whole directory before touching any field bytes
	            directory  (vec (repeatedly dir-entry-count
	                                        #(parse-shape dir-entry-shape incoming)))
	            read-field (fn [acc {:keys [tag field-length]}]
	                         (let [control?  (control-field? tag)
	                               field-key (field-names tag)]
	                           (assoc acc field-key
	                                  (if control?
	                                    (parse-shape [[:control-fields field-length]] incoming)
	                                    (parse-data tag
	                                                (parse-shape [[:raw-field field-length]] incoming))))))
	            fields     (reduce read-field {} directory)]
	        (parse-shape [[:tail 2]] incoming) ;; eat the last two bytes
	        fields))))

	(def example-records
	  ;; Maps the sample file with nio/mmap and eagerly parses 270 records from
	  ;; it, one after another, into a vector.
	  (let [buffer (nio/mmap "/Users/jack/tmp/for-paul/PGA-Australiana.mrc")]
	    (vec (repeatedly 270 #(parse-record buffer)))))

	;; REPL check: inspect the first parsed record. The commented map that
	;; follows is the output the author recorded.
	(first example-records)
	;; {:fixed-length-data-elements {:control-fields "070529s9999 xx 000 0 und d"},
	;; :main-entry-personal-name
	;; {:indicator "1 ", :subfields {:personal-name "Amundsen, Roald,", :dates-associated-with-a-name "1872-1928."}},
	;; :title-statement {:indicator "14", :subfields {:title "The South Pole", :medium "[electronic resource]."}},
	;; :general-note {:indicator " ", :subfields {"500" "An ebook provided by Project Gutenberg Australia."}},
	;; :electronic-location-and-access {:indicator "40", :subfields {"856" "http://gutenberg.net.au/ebooks/e00111.txt "}}}

	;; Let's get some by-author counts
	(let [author-of #(some-> % :main-entry-personal-name :subfields :personal-name)]
	  ;; count records per personal name (records without one are dropped),
	  ;; highest count first
	  (sort-by val > (frequencies (keep author-of example-records))))
	;;=>
	;; ["Lawson, Henry," 15]
	;; ["Paterson, A. B." 11]
	;; ["Dyson, Edward," 9]
	;; ["Dennis, C. J." 9]
	;; ["Richardson, Henry Handel," 8]
	;; ["Clarke, Marcus," 7]
	;; ["Becke, Louis," 6]
	;; ["Praed, Campbell," 6]
	;; ["Scott, Ernest," 5]
	;; ["Gaunt, Mary," 5]

	;; how about titles of works by Mary Gaunt?
	(for [record example-records
	      :when (= "Gaunt, Mary,"
	               (some-> record :main-entry-personal-name :subfields :personal-name))]
	  ;; title subfield of each matching record, in file order
	  (-> record :title-statement :subfields :title))
	;; => ("The Moving Finger" "Kirkham's Find" "End of the Earth" "The Doctor's Drive" "Dave's Sweetheart")