Skip to content

Instantly share code, notes, and snippets.

@apage43
Last active December 23, 2015 23:29
Show Gist options
  • Save apage43/6710068 to your computer and use it in GitHub Desktop.
Save apage43/6710068 to your computer and use it in GitHub Desktop.
Parse USPTO patent assignment XML files lazily, straight out of the zip archives available from http://www.google.com/googlebooks/uspto-patents-assignments.html
(ns patentparse
(:import [java.util.zip ZipEntry ZipInputStream])
(:require [clojure.java.io :as io]
[clojure.string :as st]
[clojure.data.xml :as dxml]))
;; Formatter for date strings written with the pattern yyyyMMdd.
;; NOTE: java.text.SimpleDateFormat is not synchronized; a single shared
;; instance like this one must not be used from several threads at once
;; without external locking.
(def dateformat (java.text.SimpleDateFormat. "yyyyMMdd"))
(defn tag-p?
  "Builds a one-argument predicate that is true when the :tag of its
  argument equals `tag`."
  [tag]
  (letfn [(matches? [node]
            (= tag (:tag node)))]
    matches?))
(defn tag1
  "Returns the first item in the :content of `intag` whose :tag equals
  `tag`, or nil when no item matches."
  [intag tag]
  (->> (:content intag)
       (filter (tag-p? tag))
       first))
(defn tag1->
  "Walks down from `tag`, at each step moving to the first child (via
  `tag1`) named by the next element of `path`. With an empty path the
  starting `tag` is returned unchanged."
  [tag & path]
  (loop [node  tag
         steps path]
    (if (seq steps)
      (recur (tag1 node (first steps)) (rest steps))
      node)))
(defn tags
  "Returns a lazy seq of every item in the :content of `otag` whose :tag
  equals `tag`."
  [otag tag]
  (let [wanted? (tag-p? tag)]
    (->> otag
         :content
         (filter wanted?))))
(defn txt
  "Concatenates the string items found directly in the :content of `tag`;
  non-string items are ignored. Yields \"\" when there are none."
  [tag]
  (->> tag
       :content
       (filter string?)
       (apply str)))
(defn patents
  "Parses `stream` with clojure.data.xml and returns a lazy seq of the
  child elements of the first :patent-assignments child of the parsed
  root element.

  Text nodes found directly among those children are dropped: some
  clojure.data.xml versions keep whitespace-only character data between
  elements, and such strings are not assignment records."
  [stream]
  (->> (tag1-> (dxml/parse stream) :patent-assignments)
       :content
       ;; `remove` is lazy, so the document is still consumed incrementally.
       (remove string?)))
(defn address
  "Builds a newline-separated address string from `atag`. The children
  :address-1 through :address-4, :city, :state, :country-name and
  :postcode are looked up in that order; children that are absent are
  skipped, and the text of each one present becomes one line."
  [atag]
  (let [fields  [:address-1 :address-2 :address-3 :address-4
                 :city :state :country-name :postcode]
        present (keep #(tag1-> atag %) fields)]
    (st/join "\n" (map txt present))))
(defn proc-date
  "Reads the :date child of `odtag` and parses its text with the
  `dateformat` pattern. Returns a java.util.Date, or nil when there is no
  :date child or the text cannot be parsed as a real calendar date.

  Parsing is done on a private, non-lenient clone of `dateformat`:
  - non-lenient, so impossible values (month 13, day 00, ...) yield nil
    instead of being rolled over into some unrelated date;
  - a clone, because SimpleDateFormat is not safe for concurrent use and
    the shared instance must not be mutated."
  [odtag]
  (when-let [idtag (tag1-> odtag :date)]
    (let [^java.text.SimpleDateFormat fmt
          (.clone ^java.text.SimpleDateFormat dateformat)]
      (.setLenient fmt false)
      ;; Best-effort by design: any failure while reading or parsing the
      ;; text results in nil rather than an exception.
      (try
        (.parse fmt ^String (txt idtag))
        (catch Exception _ nil)))))
(defn maybe-tag
  "Looks up the first `itag` child of `stag`. When that child has
  non-empty :content, returns a single-entry map from `k` to the
  concatenation (via `str`) of all content items; otherwise returns nil.
  The two-argument form uses `itag` itself as the key."
  ([stag itag]
   (maybe-tag stag itag itag))
  ([stag itag k]
   (let [pieces (:content (tag1-> stag itag))]
     (when (seq pieces)
       {k (apply str pieces)}))))
(defn proc-entity
  "Summarises `etag` as a map with :name (text of its :name child) and
  :address (the string produced by `address`)."
  [etag]
  (let [entity-name (-> etag (tag1-> :name) txt)
        entity-addr (address etag)]
    {:name    entity-name
     :address entity-addr}))
(defn extract
  "Returns a map holding one entry per name in `itags` for which
  `maybe-tag` finds content under `stag`; names without content are
  simply absent. Returns {} when nothing is found."
  [stag & itags]
  (reduce (fn [acc tname]
            (merge acc (maybe-tag stag tname)))
          {}
          itags))
(defn proc-assignor
  "Describes `atag` as the `proc-entity` map extended with
  :execution-date and :date-acknowledged, each obtained by running
  `proc-date` on the child of the same name."
  [atag]
  (let [date-of (fn [k] (proc-date (tag1-> atag k)))]
    (assoc (proc-entity atag)
           :execution-date    (date-of :execution-date)
           :date-acknowledged (date-of :date-acknowledged))))
(defn proc
  "Converts one patent-assignment element `ptag` into a plain map:

    :kind        always :assignment
    :assignment  reel, frame, recorded date, purge flag, correspondent,
                 conveyance text and (when present) :page-count
    :assignors   lazy seq of `proc-assignor` maps
    :assignees   lazy seq of `proc-entity` maps
    :properties  lazy seq of {:title ... :documents ...} maps, one per
                 child of :patent-properties

  The nested seqs are lazy, exactly like the underlying parsed XML."
  [ptag]
  (let [arectag   (tag1-> ptag :assignment-record)
        ;; Child elements of the named container. Text nodes are dropped:
        ;; some clojure.data.xml versions keep whitespace-only strings
        ;; between elements, and those would otherwise be turned into
        ;; empty assignor/assignee/property entries.
        children  (fn [container]
                    (remove string? (:content (tag1-> ptag container))))
        assignors (children :patent-assignors)
        assignees (children :patent-assignees)
        props     (children :patent-properties)]
    {:kind :assignment
     :assignment
     (merge {:reel (txt (tag1-> arectag :reel-no))
             :frame (txt (tag1-> arectag :frame-no))
             :recorded (proc-date (tag1-> arectag :recorded-date))
             :purged (= "Y" (txt (tag1-> arectag :purge-indicator)))
             :correspondent (proc-entity (tag1-> arectag :correspondent))
             :conveyance-text (txt (tag1-> arectag :conveyance-text))}
            ;; Every other field of this map comes from the assignment
            ;; record, and that is where the USPTO assignment format puts
            ;; page-count as well (NOTE(review): confirm against the DTD).
            ;; Looking only at `ptag` therefore never found it. The old
            ;; top-level lookup is kept as a fallback.
            (or (maybe-tag arectag :page-count)
                (maybe-tag ptag :page-count)))
     :assignors (map proc-assignor assignors)
     :assignees (map proc-entity assignees)
     :properties
     (for [prop props]
       {:title (txt (tag1-> prop :invention-title))
        :documents (for [doc (tags prop :document-id)]
                     (merge (extract doc
                                     :country :doc-number :kind :name)
                            {:date (proc-date doc)}))})}))
(defn make-id
  "Builds an identifier string of the form \"<reel>/<frame>::<recorded>\"
  from the :assignment entry of the given map, where <recorded> is the
  value obtained by encoding :recorded to JSON and decoding it again."
  [{{:keys [reel frame recorded]} :assignment}]
  ;; NOTE(review): the `json` alias is not required by the ns form visible
  ;; in this file; a namespace providing encode/decode must be aliased as
  ;; `json` before this function can compile.
  (let [recorded-value (-> recorded json/encode json/decode)]
    (str reel "/" frame "::" recorded-value)))
(defn for-patents
  "Opens `zipfile` as a zip stream and, for each entry in turn, parses
  the entry's bytes with `patents`, converts every resulting item with
  `proc` and calls `f` on it for side effects. All work happens while the
  streams are open; both streams are closed on exit. Returns nil."
  [zipfile f]
  (with-open [raw    (io/input-stream zipfile)
              zip-in (ZipInputStream. raw)]
    ;; Advance entry by entry; getNextEntry yields nil after the last one.
    (loop []
      (when (.getNextEntry zip-in)
        (doseq [assignment (map proc (patents zip-in))]
          (f assignment))
        (.closeEntry zip-in)
        (recur)))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment