Last active
February 21, 2018 10:34
-
-
Save jackrusher/66b0690f76c3ed65ac4fb271e1640e91 to your computer and use it in GitHub Desktop.
An example of parsing MARC21 for PEF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns marc21-reader | |
(:require [nio.core :as nio] ; only for mmap | |
[clojure.string :as string] | |
[hickory.core :as hc] | |
[hickory.select :as hs])) | |
(def ^:dynamic *current-encoding*
  "Charset name used when raw record bytes are decoded into strings.
  `parse-record` rebinds it while reading a record whose leader yields a
  non-nil :encoding; otherwise this default stays in effect."
  "ISO-8859-1")
;; I'm old, tired and lazy, so I spider the format description files
;; and extract the data into local lookup tables. That lets me query
;; the records interactively with human-readable keywords.
;; Source page: https://www.loc.gov/marc/bibliographic/bdsummary.html
(def field-names
  "Lookup table built at load time from a locally saved HTML page
  (presumably a copy of the LoC bibliographic field summary).

  For every element selected under the \"summarytable\" class, the first
  content item of each descendant carrying a `width` attribute is
  collected into a row. The row's first item becomes the key and its
  second item, turned into a lower-case hyphenated keyword, becomes the
  value. Rows whose second item is an element map are ignored."
  (let [page   (-> "/Users/jack/tmp/for-paul/marc-field-types.html"
                   slurp
                   hc/parse
                   hc/as-hickory)
        tables (hs/select (hs/child (hs/class "summarytable")) page)
        rows   (for [table tables]
                 (map (comp first :content)
                      (hs/select (hs/attr :width) table)))
        ;; "Main Entry - Personal Name" -> :main-entry-personal-name
        label->keyword (fn [label]
                         (-> (string/trim label)
                             string/lower-case
                             (string/replace #"[^a-z]+" "-")
                             keyword))]
    (into {}
          (for [[tag meaning] rows
                :when (not (map? meaning))]
            [tag (label->keyword meaning)]))))
(defn extract-subfield-names
  "Scrapes subfield descriptions out of the HTML file `filename` and
  returns a map from subfield-code keyword (e.g. :a) to a keyword derived
  from the description (e.g. :personal-name).

  Candidate text is taken from the string children of :li and :td
  elements whose text matches `^\\$`. Each string is expected to look like
  \"$a - Personal name (NR)\":

  * only the FIRST hyphen separates code from description, so hyphens
    inside the description are preserved rather than truncating it;
  * strings that do not start with `$<code>` or that contain no hyphen
    (stray whitespace, unrelated text) are skipped instead of throwing;
  * a trailing parenthesised suffix such as \" (NR)\" is dropped, and any
    run of characters outside a-z becomes a single \"-\"."
  [filename]
  (let [label->keyword (fn [label]
                         (-> (string/trim label)
                             string/lower-case
                             (string/replace #" \(.*\)$" "")
                             (string/replace #"[^a-z]+" "-")
                             keyword))
        parse-entry    (fn [text]
                         ;; limit 2: split on the first hyphen only
                         (let [[code meaning] (string/split text #"-" 2)
                               [_ letter]     (re-find #"^\s*\$(.)" code)]
                           (when (and letter meaning)
                             [(keyword letter) (label->keyword meaning)])))]
    (->> (slurp filename)
         hc/parse
         hc/as-hickory
         (hs/select (hs/child (hs/find-in-text #"^\$")))
         (filter #(#{:li :td} (:tag %)))
         (mapcat :content)
         (filter string?)
         (keep parse-entry)
         (into {}))))
;; N.B. For this to be a general solution, we would need to read in a
;; subfield table for every tag in field-names above, not just these two.
(def subfield-names
  "Map from field tag (a string) to that field's subfield lookup table,
  as produced by `extract-subfield-names` from locally saved HTML pages.
  Only the two tags listed here have tables."
  (into {}
        (for [[tag html-file]
              [;; http://www.loc.gov/marc/bibliographic/bdx00.html
               ["100" "/Users/jack/tmp/for-paul/personal-name-subfields.html"]
               ;; https://www.loc.gov/marc/bibliographic/bd245.html
               ["245" "/Users/jack/tmp/for-paul/title-statement.html"]]]
          [tag (extract-subfield-names html-file)])))
(defn parse-shape
  "Reads consecutive fixed-width chunks from `incoming` as described by
  `shape` and returns them as a map.

  `shape` is a sequence of `[field len parse-fn]` entries, where
  `parse-fn` is optional. For each entry, `len` bytes are pulled from
  `incoming` with `.get`. When `field` is truthy the bytes are decoded
  with `*current-encoding*`, passed through `parse-fn` if one was given,
  and stored under `field`. When `field` is falsy the bytes are still
  consumed but nothing is recorded."
  [shape incoming]
  (loop [specs  (seq shape)
         parsed {}]
    (if-not specs
      parsed
      (let [[field len parse-fn] (first specs)
            raw (byte-array len)]
        ;; side effect: advances `incoming` by `len` bytes
        (.get incoming raw)
        (recur (next specs)
               (if-not field
                 parsed
                 (let [text (String. raw *current-encoding*)]
                   (assoc parsed field (cond-> text parse-fn parse-fn)))))))))
(def leader-shape
  "Shape (see `parse-shape`) of the 24-byte record leader: a vector of
  `[field len parse-fn]` entries in on-disk order.

  The :encoding parser receives a one-character String, because
  `parse-shape` decodes bytes to a String before calling the parser. It
  must therefore compare against the string \"a\"; comparing against the
  character literal `\\a` is never equal, which would leave :encoding nil
  for every record."
  (let [parse-int #(Integer/parseInt %)]
    [[:length 5 parse-int]
     [:status 1]
     [:record-type 1]
     [:biblio-level 1]
     [:type-of-control 1]
     [:encoding 1 #(when (= % "a") "UTF-8")] ; a=unicode, nil=default
     [:indicator-count 1 parse-int]
     [:subfield-code-count 1 parse-int]
     [:base-address 5 parse-int]
     [:encoding-level 1]
     [:descriptive-form 1]
     [:multipart-level 1]
     [:length-of-length 1 parse-int]
     [:length-of-start 1 parse-int]
     [:junk-in-the-trunk 1] ; implementation defined
     [:undefined 1]]))
(def dir-entry-shape
  "Shape (see `parse-shape`) of one 12-byte directory entry: a 3-byte
  tag kept as text, then a 4-byte field length and a 5-byte start
  position, both parsed as integers."
  (let [->int (fn [digits] (Integer/parseInt digits))]
    [[:tag 3]
     [:field-length 4 ->int]
     [:start-pos 5 ->int]]))
(defn control-field?
  "True when `tag` is an all-digit tag whose numeric value is below 10
  (\"001\" through \"009\"), false otherwise.

  Tags containing non-digit characters (e.g. alphabetic local tags, or
  nil) yield false instead of raising NumberFormatException, so a single
  unusual tag no longer aborts parsing of the whole record."
  [tag]
  (let [tag (str tag)]
    (boolean (and (re-matches #"\d+" tag)
                  (< (Integer/parseInt tag) 10)))))
(defn parse-data
  "Splits the decoded text of one data field into indicators and
  subfields.

  `raw` is a map holding the field text under :raw-field. Characters 1-2
  become :indicator; character 0 is skipped (presumably the terminator
  left over from the preceding field -- confirm against `parse-record`).
  The remainder is split on the subfield delimiter (char 31). Each
  non-empty piece contributes one entry to :subfields: the key is the
  name registered in `subfield-names` for `tag` and the piece's first
  character, or `tag` itself when no name is registered; the value is the
  rest of the piece.

  NOTE: because the fallback key is the tag, several unnamed subfields of
  the same field overwrite one another and only the last one is kept.
  Repeated named subfields collide the same way."
  [tag raw]
  (let [text      (:raw-field raw)
        indicator (subs text 1 3)
        delimiter (re-pattern (str (char 31)))
        pieces    (remove empty? (clojure.string/split (subs text 3) delimiter))
        names     (get subfield-names tag)]
    {:indicator indicator
     :subfields (into {}
                      (for [piece pieces]
                        [(or (get names (keyword (subs piece 0 1))) tag)
                         (subs piece 1)]))}))
(defn parse-record
  "Reads one record from `incoming` (advancing it) and returns a map of
  its fields keyed by `(field-names tag)`.

  Steps, in order:
  1. read the leader via `leader-shape`;
  2. derive the directory entry count as (base-address - 25) / 12;
  3. with `*current-encoding*` rebound to the leader's :encoding when it
     is non-nil, read every directory entry, then read each field's
     :field-length bytes in directory order (:start-pos is not used);
  4. consume two trailing bytes.

  Control fields are stored as the raw `parse-shape` result under
  :control-fields; other fields go through `parse-data`. Tags missing
  from `field-names` end up under a nil key, and a repeated tag replaces
  the earlier value."
  [incoming]
  (let [leader          (parse-shape leader-shape incoming) ;; for record length and encoding, really
        dir-entry-count (/ (- (:base-address leader) 25) 12)]
    (binding [*current-encoding* (or (:encoding leader) *current-encoding*)]
      (let [;; the whole directory must be read before any field data
            directory  (vec (repeatedly dir-entry-count
                                        #(parse-shape dir-entry-shape incoming)))
            read-field (fn [fields {:keys [tag field-length]}]
                         (assoc fields
                                (field-names tag)
                                (if (control-field? tag)
                                  (parse-shape [[:control-fields field-length]] incoming)
                                  (parse-data tag
                                              (parse-shape [[:raw-field field-length]] incoming)))))
            fields     (reduce read-field {} directory)]
        (parse-shape [[:tail 2]] incoming) ;; eat the last two bytes
        fields))))
(def example-records
  "Vector of the first 270 records parsed from the sample .mrc file,
  which is opened with `nio/mmap` and read sequentially by `parse-record`."
  (let [in (nio/mmap "/Users/jack/tmp/for-paul/PGA-Australiana.mrc")]
    (vec (repeatedly 270 #(parse-record in)))))

(first example-records)
;; {:fixed-length-data-elements {:control-fields "070529s9999 xx 000 0 und d"},
;;  :main-entry-personal-name
;;  {:indicator "1 ", :subfields {:personal-name "Amundsen, Roald,", :dates-associated-with-a-name "1872-1928."}},
;;  :title-statement {:indicator "14", :subfields {:title "The South Pole", :medium "[electronic resource]."}},
;;  :general-note {:indicator " ", :subfields {"500" "An ebook provided by Project Gutenberg Australia."}},
;;  :electronic-location-and-access {:indicator "40", :subfields {"856" "http://gutenberg.net.au/ebooks/e00111.txt "}}}

;; Let's get some by-author counts.
;; Count records per main-entry personal name, most frequent first.
;; Records without that subfield are skipped.
(->> example-records
     (keep #(get-in % [:main-entry-personal-name :subfields :personal-name]))
     frequencies
     (sort-by val >))
;;=>
;; ["Lawson, Henry," 15]
;; ["Paterson, A. B." 11]
;; ["Dyson, Edward," 9]
;; ["Dennis, C. J." 9]
;; ["Richardson, Henry Handel," 8]
;; ["Clarke, Marcus," 7]
;; ["Becke, Louis," 6]
;; ["Praed, Campbell," 6]
;; ["Scott, Ernest," 5]
;; ["Gaunt, Mary," 5]

;; How about the titles of works by Mary Gaunt?
;; Titles of every record whose main-entry personal name is exactly
;; "Gaunt, Mary," (trailing comma included, as stored in the data).
(for [record example-records
      :when (= "Gaunt, Mary,"
               (get-in record [:main-entry-personal-name :subfields :personal-name]))]
  (get-in record [:title-statement :subfields :title]))
;; => ("The Moving Finger" "Kirkham's Find" "End of the Earth" "The Doctor's Drive" "Dave's Sweetheart")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment