Last active
December 23, 2015 23:29
-
-
Save apage43/6710068 to your computer and use it in GitHub Desktop.
Parse USPTO patent assignment XML files lazily, straight out of the zips from http://www.google.com/googlebooks/uspto-patents-assignments.html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| (ns patentparse | |
| (:import [java.util.zip ZipEntry ZipInputStream]) | |
| (:require [clojure.java.io :as io] | |
| [clojure.string :as st] | |
| [clojure.data.xml :as dxml])) | |
(def dateformat
  "Shared formatter for the compact `yyyyMMdd` dates parsed by `proc-date`.
  NOTE(review): java.text.SimpleDateFormat instances are not synchronized,
  so this var should not be used from several threads at once."
  (new java.text.SimpleDateFormat "yyyyMMdd"))
(defn tag-p?
  "Returns a one-argument predicate that is true when the `:tag` of its
  argument equals `tag`. Anything without a `:tag` (for example a text
  node) yields false unless `tag` itself is nil."
  [tag]
  (fn matches-tag? [node]
    (let [node-tag (:tag node)]
      (= tag node-tag))))
(defn tag1
  "Returns the first direct child of `intag` whose `:tag` is `tag`,
  or nil when there is no such child (or `intag` is nil)."
  [intag tag]
  (->> (:content intag)
       (filter (tag-p? tag))
       first))
(defn tag1->
  "Walks down from `tag`, following `path` one tag name at a time and
  taking the first matching child at each step (see `tag1`). Returns the
  element reached, `tag` itself for an empty path, or nil as soon as a
  step has no match."
  [tag & path]
  (loop [node tag
         remaining path]
    (if (seq remaining)
      (recur (tag1 node (first remaining)) (rest remaining))
      node)))
(defn tags
  "Returns a lazy sequence of every direct child of `otag` whose `:tag`
  is `tag`, in document order."
  [otag tag]
  (let [wanted? (tag-p? tag)]
    (for [child (:content otag)
          :when (wanted? child)]
      child)))
(defn txt
  "Concatenates the string children found directly under `tag`, ignoring
  nested elements. Returns \"\" when there are none (or `tag` is nil)."
  [tag]
  (->> tag
       :content
       (filter string?)
       (apply str)))
(defn patents
  "Parses `stream` as XML with clojure.data.xml and returns the children
  of the first `patent-assignments` element directly under the document
  root, or nil if the root has no such child."
  [stream]
  (-> stream
      dxml/parse
      (tag1-> :patent-assignments)
      :content))
(defn address
  "Builds a newline-separated address string from the address-related
  children of `atag`. Fields are emitted in a fixed order; children that
  are absent are skipped, while present-but-empty ones contribute an
  empty line."
  [atag]
  (->> [:address-1 :address-2 :address-3 :address-4
        :city :state :country-name :postcode]
       (keep #(tag1-> atag %))
       (map txt)
       (st/join "\n")))
(defn proc-date
  "Reads the `date` child of `odtag` and parses its text with
  `dateformat`. Returns a java.util.Date, or nil when there is no `date`
  child or parsing throws (parse failures are deliberately swallowed)."
  [odtag]
  (when-let [inner (tag1-> odtag :date)]
    (try
      (.parse dateformat (txt inner))
      (catch Exception _ nil))))
(defn maybe-tag
  "Returns a single-entry map `{k value}` where value is the stringified,
  concatenated content of the `itag` child of `stag`, or nil when that
  child is missing or has no content. With two arguments the tag name
  itself is used as the key."
  ([stag itag]
     (maybe-tag stag itag itag))
  ([stag itag k]
     (let [parts (:content (tag1-> stag itag))]
       (when (seq parts)
         {k (apply str parts)}))))
(defn proc-entity
  "Turns an entity element into a map with its `:name` (text of the
  `name` child) and its `:address` (see `address`)."
  [etag]
  (let [entity-name (txt (tag1-> etag :name))
        postal      (address etag)]
    {:name    entity-name
     :address postal}))
(defn extract
  "Collects the children of `stag` named in `itags` into one map keyed by
  tag name. Children that are missing or empty are left out (see
  `maybe-tag`); returns {} when nothing is found."
  [stag & itags]
  (reduce (fn [acc tname]
            (merge acc (maybe-tag stag tname)))
          {}
          itags))
(defn proc-assignor
  "Describes an assignor element: the `proc-entity` map extended with
  `:execution-date` and `:date-acknowledged`, each a parsed date or nil
  (see `proc-date`)."
  [atag]
  (let [date-of (fn [child] (proc-date (tag1-> atag child)))]
    (assoc (proc-entity atag)
           :execution-date    (date-of :execution-date)
           :date-acknowledged (date-of :date-acknowledged))))
(defn proc
  "Converts one parsed `patent-assignment` element into a plain map:

    :kind        always :assignment
    :assignment  reel, frame, recorded date, purge flag, correspondent,
                 conveyance text and (when present) :page-count
    :assignors   lazy seq of `proc-assignor` maps
    :assignees   lazy seq of `proc-entity` maps
    :properties  lazy seq of {:title ... :documents ...}

  The nested sequences are lazy (`map`/`for`), so they are only computed
  when a consumer walks them."
  [ptag]
  (let [arectag   (tag1-> ptag :assignment-record)
        assignors (:content (tag1-> ptag :patent-assignors))
        assignees (:content (tag1-> ptag :patent-assignees))
        props     (:content (tag1-> ptag :patent-properties))]
    {:kind :assignment
     :assignment
     (merge {:reel (txt (tag1-> arectag :reel-no))
             :frame (txt (tag1-> arectag :frame-no))
             :recorded (proc-date (tag1-> arectag :recorded-date))
             :purged (= "Y" (txt (tag1-> arectag :purge-indicator)))
             :correspondent (proc-entity (tag1-> arectag :correspondent))
             :conveyance-text (txt (tag1-> arectag :conveyance-text))}
            ;; Every other assignment field is read from the
            ;; assignment-record element, and page-count is expected there
            ;; as well (NOTE(review): confirm against the USPTO assignment
            ;; DTD). The lookup on the outer element is kept as a fallback
            ;; so any input that worked before still works.
            (or (maybe-tag arectag :page-count)
                (maybe-tag ptag :page-count)))
     :assignors (map proc-assignor assignors)
     :assignees (map proc-entity assignees)
     :properties
     (for [prop props]
       {:title (txt (tag1-> prop :invention-title))
        :documents (for [doc (tags prop :document-id)]
                     (merge (extract doc
                                     :country :doc-number :kind :name)
                            ;; the document-id element carries its own
                            ;; `date` child, which proc-date looks up
                            {:date (proc-date doc)}))})}))
(defn make-id
  "Builds a string identifier \"<reel>/<frame>::<recorded>\" for a map
  produced by `proc`.

  The original implementation round-tripped `recorded` through
  `json/encode` / `json/decode`, but no `json` alias is required by this
  namespace, so the file could not be loaded. The round trip is replaced
  with a standard-library formatter: a java.util.Date is rendered as an
  ISO-8601 UTC timestamp (yyyy-MM-dd'T'HH:mm:ss'Z'); nil contributes an
  empty string; any other value is stringified as-is.
  NOTE(review): the timestamp layout assumes the missing alias was
  Cheshire, whose default date encoding is this format - confirm if
  existing ids must stay byte-compatible."
  [{{:keys [reel frame recorded]} :assignment}]
  (let [stamp (if (instance? java.util.Date recorded)
                ;; A fresh formatter per call: SimpleDateFormat is not
                ;; safe to share between threads.
                (let [fmt (doto (java.text.SimpleDateFormat.
                                 "yyyy-MM-dd'T'HH:mm:ss'Z'")
                            (.setTimeZone
                             (java.util.TimeZone/getTimeZone "UTC")))]
                  (.format fmt ^java.util.Date recorded))
                recorded)]
    (str reel "/" frame "::" stamp)))
(defn for-patents
  "Opens `zipfile` as a zip archive and, for each entry in turn, parses
  the entry as patent-assignment XML and calls `f` on every processed
  assignment (see `patents` and `proc`). Both streams are closed when the
  walk finishes or throws. Returns nil; `f` is called for side effects."
  [zipfile f]
  (with-open [raw (io/input-stream zipfile)
              zis (ZipInputStream. raw)]
    (loop []
      ;; getNextEntry returns nil once the archive is exhausted
      (when (.getNextEntry zis)
        (doseq [assignment (map proc (patents zis))]
          (f assignment))
        (.closeEntry zis)
        (recur)))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment