Last active
March 27, 2025 13:09
-
-
Save jscrane/de6101ae12b8e1e3528534a572b1d5a9 to your computer and use it in GitHub Desktop.
Clojure script to parse PDF files containing health test data into CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/sh | |
| "exec" "/usr/local/bin/clojure" "-Sdeps" '{:deps,{org.apache.pdfbox/pdfbox,{:mvn/version,"1.8.2"},org.clojure/data.csv,{:mvn/version,"1.1.0"}}}' -M "$0" "$@" | |
| (ns bloods | |
| (:require [clojure.data.csv :as csv] | |
| [clojure.string :as str]) | |
| (:import (java.io StringWriter) | |
| (org.apache.pdfbox.pdmodel PDDocument) | |
| (org.apache.pdfbox.util PDFTextStripper))) | |
(def metadata
  "Line prefixes identifying the metadata lines of a report. Each matching
  line is later split into a name/value pair by `as-meta`. Only the
  \"hospital:\" prefix carries a trailing colon."
  ["prescribing doctor", "date requested", "hospital:", "number of the sample"])
(def test-names
  "Maps every test label as it appears in a report (the text before the first
  \": \" on a result line) to the canonical name written to the CSV. Several
  report labels may map to the same canonical name."
  {
   "NA:Sodium" "Sodium"
   "sodium" "Sodium"
   "K:Potassium" "Potassium"
   "potassium" "Potassium"
   "CL:Chloride" "Chloride"
   "chlorides" "Chloride"
   "calcium" "Calcium"
   "phosphorous" "Phosphorous"
   "magnesium" "Magnesium"
   "UREA:Urea" "Urea"
   "urea" "Urea"
   "CRE:Creatinine" "Creatinine"
   "creatinine" "Creatinine"
   "ALB:Albumin" "Albumin"
   "albumin" "Albumin"
   "BIL:Total Bilirubin" "Bilirubin (total)"
   "bilirubin (total)" "Bilirubin (total)"
   "ALP:Alk.P,tase" "Alkaline Phosphatase"
   "alkaline phosphatase" "Alkaline Phosphatase"
   "GGT:Gamma G T" "Gamma GT"
   "gamma-GT" "Gamma GT"
   "ALT:ALT" "ALT (GPT)"
   "ALT (GPT)" "ALT (GPT)"
   "GLU:Glucose" "Glucose"
   "blood sugar" "Glucose"
   "CHOL:Total Cholesterol" "Cholesterol (total)"
   "cholesterol (total)" "Cholesterol (total)"
   "triglycerides" "Triglycerides"
   "TG:Triglyceride" "Triglycerides"
   "HDLC:HDL Cholesterol" "HDL Cholesterol"
   "HDL Cholesterol" "HDL Cholesterol"
   "cholesterol (HDL)" "HDL Cholesterol"
   "Non-HDL cholesterol" "Non-HDL Cholesterol"
   "NHDLC:Non-HDL cholesterol" "Non-HDL Cholesterol"
   "LDL cholesterol" "LDL Cholesterol"
   "LDL Cholesterol (Calculated)" "LDL Cholesterol"
   "LDLC:LDL Cholesterol" "LDL Cholesterol"
   "LDLC:LDL Cholesterol (Calculated)" "LDL Cholesterol"
   "T.Chol/HDL Ratio" "Total Cholesterol/HDL Ratio"
   "Chol/HDL" "Total Cholesterol/HDL Ratio"
   "CHOL/HDL" "Total Cholesterol/HDL Ratio"
   "TCHDR:T.Chol/HDL Ratio" "Total Cholesterol/HDL Ratio"
   "TSH:TSH" "TSH"
   "TSH" "TSH"
   "CRP:C Reactive Protein" "C Reactive Protein"
   "C reactive protein" "C Reactive Protein"
   "vit B12" "Vitamin B12"
   ;; Previously mapped to the raw label "B12:Vitamin B12", which split the
   ;; same analyte across two names in the output; use the canonical name.
   "B12:Vitamin B12" "Vitamin B12"
   "FOL:Folate" "Folate"
   "folate" "Folate"
   "FER:Ferritin" "Ferritin"
   "ferritin" "Ferritin"
   "PSA total" "PSA"
   "White Cell Count" "White Cell Count"
   "WCC:White Cell Count" "White Cell Count"
   "leucocytes [WBC]" "White Cell Count"
   "HB:Haemoglobin" "Haemoglobin"
   "haemoglobin" "Haemoglobin"
   "platelets" "Platelets"
   "PLT:Platelets" "Platelets"
   "RBC" "Red Cell Count"
   "RCC:Red Cell Count" "Red Cell Count"
   "Red Cell Count" "Red Cell Count"
   "HCT:Haematocrit" "Haematocrit"
   "haematocrit" "Haematocrit"
   "MCV:Mean Cell Volume" "Mean Cell Volume"
   "MCV" "Mean Cell Volume"
   "MCH" "Mean Cell Haemoglobin"
   "MCH:Mean Cell Haemoglobin" "Mean Cell Haemoglobin"
   "RDW:RDW" "Red Cell Distribution Width"
   "Red Cell Distribution Width [RDW]" "Red Cell Distribution Width"
   "neutrophils" "Neutrophils"
   "NEUTA:Neutrophils" "Neutrophils"
   "lymphocytes" "Lymphocytes"
   "LYMA:Lymphocytes" "Lymphocytes"
   "monocytes" "Monocytes"
   "MONOA:Monocytes" "Monocytes"
   "eosinophils" "Eosinophils"
   "EOSA:Eosinophils" "Eosinophils"
   "basophils" "Basophils"
   "BASA:Basophils" "Basophils"
   "Nucleated RBC's." "Nucleated RBCs"
   "25(OH)Vitamin D" "Vitamin D"
   "globulin" "Globulin"
   "eGFR" "eGFR"
   "total proteins" "Total Proteins"
   "HbA1c HPLC (EDTA)" "HbA1c HPLC (EDTA)"
   "FE:Serum Iron" "Serum Iron"
   "TRFN:Serum Transferrin" "Serum Transferrin"
   "TIBC/C:TIBC (Calculated)" "TIBC"
   "TSAT:% Transferrin Satn." "Transferrin Saturation %"
   })
(def tests
  "Every report label known to `test-names`; used as the set of line prefixes
  that mark a result line."
  (map key test-names))
(defn text-of-pdf
  "Loads the PDF at path `file`, extracts its text with a PDFTextStripper and
  returns the text as a vector of lines. The document is closed before
  returning.

  Lines are split with `clojure.string/split-lines`, so both \\n and \\r\\n
  line endings are handled and no stray carriage return is left at the end
  of a line."
  [^String file]
  (with-open [pd (PDDocument/load file)]
    (-> (PDFTextStripper.)
        (.getText pd)
        str/split-lines)))
(defn as-test
  "Parses one result line of the form \"<label>: <result> <units> <range>\"
  into a sorted map with keys :test, :test-result, :test-units and
  :test-result-range.

  :test is the canonical name looked up in `test-names` (nil when the label
  is not a known key). A units or range token of \"-\" is reported as nil.
  Fields missing from the line are nil; a line with nothing after the label
  no longer throws."
  [line]
  (let [[label data] (str/split line #": ")
        ;; Split on runs of whitespace so repeated spaces do not produce
        ;; empty tokens that shift units/range; tolerate absent data.
        [result units ref-range] (some-> data
                                         str/trim
                                         not-empty
                                         (str/split #"\s+"))]
    (sorted-map :test (test-names label)
                :test-result result
                :test-units (when (not= units "-") units)
                :test-result-range (when (not= ref-range "-") ref-range))))
(defn as-meta
  "Turns metadata lines of the form \"<name>: <value>[, ...]\" into a sorted
  map of (keyword name) -> trimmed value.

  Only the part of the value before the first comma is kept. A line whose
  value is empty or missing (e.g. \"hospital:\" or \"hospital: \") yields an
  empty string instead of throwing."
  [lines]
  (into (sorted-map)
        (map (fn [line]
               ;; Limit 2 keeps later colons (e.g. a time of day) inside the
               ;; value and always yields a value slot, even when empty.
               (let [[label value] (str/split line #":\s*" 2)
                     value (first (str/split (or value "") #","))]
                 [(keyword label) (str/trim (or value ""))])))
        lines))
(defn starting-with?
  "Returns a predicate over a line of text: true when the line starts with
  at least one of the strings in `coll`, otherwise nil."
  [coll]
  (fn [line]
    (some (partial str/starts-with? line) coll)))
(defn read-file
  "Reads the PDF at `file` and returns a lazy sequence of result rows, one
  sorted map per recognised test line, each merged with the file's metadata
  (as parsed by `as-meta`).

  Only lines containing a colon are considered. Metadata lines are those
  starting with an entry of `metadata`; result lines are those starting with
  an entry of `tests`."
  [file]
  (let [lines     (->> (text-of-pdf file)
                       (filter #(str/includes? % ":")))
        file-meta (as-meta (filter (starting-with? metadata) lines))]
    (->> lines
         (filter (starting-with? tests))
         (map as-test)
         ;; Prefix matching also admits lines whose label merely begins with
         ;; a known key (e.g. \"MCHC...\" matches \"MCH\"); such labels have
         ;; no canonical name, so drop them instead of emitting unnamed rows.
         (filter :test)
         (map #(merge file-meta %)))))
(defn write-csv
  "Renders `tests` (a sequence of keyword-keyed maps) as a CSV string whose
  first record is the header.

  Columns are the sorted union of the keys of all rows, so every value lands
  under its own column even when rows do not share the same key set; a key
  missing from a row produces an empty cell. When all rows are sorted maps
  with identical keys the output matches a header taken from the first row."
  [tests]
  (let [columns (->> tests (mapcat keys) distinct sort)
        hdr     (map name columns)
        rows    (map (fn [row] (mapv row columns)) tests)]
    (with-open [s (StringWriter.)]
      (csv/write-csv s (cons hdr rows))
      (str s))))
;; Script entry point: treat every command-line argument as a PDF path,
;; gather the result rows of all files and print them as a single CSV.
(let [rows (mapcat read-file *command-line-args*)]
  (println (write-csv rows)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment