Skip to content

Instantly share code, notes, and snippets.

@jscrane
Last active March 27, 2025 13:09
Show Gist options
  • Select an option

  • Save jscrane/de6101ae12b8e1e3528534a572b1d5a9 to your computer and use it in GitHub Desktop.

Select an option

Save jscrane/de6101ae12b8e1e3528534a572b1d5a9 to your computer and use it in GitHub Desktop.
Clojure script to parse PDF files containing health test data into CSV
#!/bin/sh
"exec" "/usr/local/bin/clojure" "-Sdeps" '{:deps,{org.apache.pdfbox/pdfbox,{:mvn/version,"1.8.2"},org.clojure/data.csv,{:mvn/version,"1.1.0"}}}' -M "$0" "$@"
(ns bloods
(:require [clojure.data.csv :as csv]
[clojure.string :as str])
(:import (java.io StringWriter)
(org.apache.pdfbox.pdmodel PDDocument)
(org.apache.pdfbox.util PDFTextStripper)))
(def metadata
  "Line prefixes that identify report-level metadata lines in the extracted
  PDF text. A line starting with any of these is parsed by `as-meta`."
  ["prescribing doctor"
   "date requested"
   "hospital:"
   "number of the sample"])
(def test-names
  "Maps the test label as printed in a report (the text before the first
  \": \" on a result line) to the canonical test name used in the CSV output.
  Several labels may map to the same canonical name so that reports using
  different labelling conventions end up in the same column value."
  {;; electrolytes and minerals
   "NA:Sodium" "Sodium"
   "sodium" "Sodium"
   "K:Potassium" "Potassium"
   "potassium" "Potassium"
   "CL:Chloride" "Chloride"
   "chlorides" "Chloride"
   "calcium" "Calcium"
   "phosphorous" "Phosphorous"
   "magnesium" "Magnesium"
   ;; renal
   "UREA:Urea" "Urea"
   "urea" "Urea"
   "CRE:Creatinine" "Creatinine"
   "creatinine" "Creatinine"
   ;; liver
   "ALB:Albumin" "Albumin"
   "albumin" "Albumin"
   "BIL:Total Bilirubin" "Bilirubin (total)"
   "bilirubin (total)" "Bilirubin (total)"
   "ALP:Alk.P,tase" "Alkaline Phosphatase"
   "alkaline phosphatase" "Alkaline Phosphatase"
   "GGT:Gamma G T" "Gamma GT"
   "gamma-GT" "Gamma GT"
   "ALT:ALT" "ALT (GPT)"
   "ALT (GPT)" "ALT (GPT)"
   ;; glucose and lipids
   "GLU:Glucose" "Glucose"
   "blood sugar" "Glucose"
   "CHOL:Total Cholesterol" "Cholesterol (total)"
   "cholesterol (total)" "Cholesterol (total)"
   "triglycerides" "Triglycerides"
   "TG:Triglyceride" "Triglycerides"
   "HDLC:HDL Cholesterol" "HDL Cholesterol"
   "HDL Cholesterol" "HDL Cholesterol"
   "cholesterol (HDL)" "HDL Cholesterol"
   "Non-HDL cholesterol" "Non-HDL Cholesterol"
   "NHDLC:Non-HDL cholesterol" "Non-HDL Cholesterol"
   "LDL cholesterol" "LDL Cholesterol"
   "LDL Cholesterol (Calculated)" "LDL Cholesterol"
   "LDLC:LDL Cholesterol" "LDL Cholesterol"
   "LDLC:LDL Cholesterol (Calculated)" "LDL Cholesterol"
   "T.Chol/HDL Ratio" "Total Cholesterol/HDL Ratio"
   "Chol/HDL" "Total Cholesterol/HDL Ratio"
   "CHOL/HDL" "Total Cholesterol/HDL Ratio"
   "TCHDR:T.Chol/HDL Ratio" "Total Cholesterol/HDL Ratio"
   ;; thyroid, inflammation, vitamins, iron stores
   "TSH:TSH" "TSH"
   "TSH" "TSH"
   "CRP:C Reactive Protein" "C Reactive Protein"
   "C reactive protein" "C Reactive Protein"
   "vit B12" "Vitamin B12"
   ;; Previously mapped to the raw label "B12:Vitamin B12", which split the
   ;; same test across two names depending on the report format.
   "B12:Vitamin B12" "Vitamin B12"
   "FOL:Folate" "Folate"
   "folate" "Folate"
   "FER:Ferritin" "Ferritin"
   "ferritin" "Ferritin"
   "PSA total" "PSA"
   ;; full blood count
   "White Cell Count" "White Cell Count"
   "WCC:White Cell Count" "White Cell Count"
   "leucocytes [WBC]" "White Cell Count"
   "HB:Haemoglobin" "Haemoglobin"
   "haemoglobin" "Haemoglobin"
   "platelets" "Platelets"
   "PLT:Platelets" "Platelets"
   "RBC" "Red Cell Count"
   "RCC:Red Cell Count" "Red Cell Count"
   "Red Cell Count" "Red Cell Count"
   "HCT:Haematocrit" "Haematocrit"
   "haematocrit" "Haematocrit"
   "MCV:Mean Cell Volume" "Mean Cell Volume"
   "MCV" "Mean Cell Volume"
   "MCH" "Mean Cell Haemoglobin"
   "MCH:Mean Cell Haemoglobin" "Mean Cell Haemoglobin"
   "RDW:RDW" "Red Cell Distribution Width"
   "Red Cell Distribution Width [RDW]" "Red Cell Distribution Width"
   "neutrophils" "Neutrophils"
   "NEUTA:Neutrophils" "Neutrophils"
   "lymphocytes" "Lymphocytes"
   "LYMA:Lymphocytes" "Lymphocytes"
   "monocytes" "Monocytes"
   "MONOA:Monocytes" "Monocytes"
   "eosinophils" "Eosinophils"
   "EOSA:Eosinophils" "Eosinophils"
   "basophils" "Basophils"
   "BASA:Basophils" "Basophils"
   "Nucleated RBC's." "Nucleated RBCs"
   ;; miscellaneous
   "25(OH)Vitamin D" "Vitamin D"
   "globulin" "Globulin"
   "eGFR" "eGFR"
   "total proteins" "Total Proteins"
   "HbA1c HPLC (EDTA)" "HbA1c HPLC (EDTA)"
   "FE:Serum Iron" "Serum Iron"
   "TRFN:Serum Transferrin" "Serum Transferrin"
   "TIBC/C:TIBC (Calculated)" "TIBC"
   "TSAT:% Transferrin Satn." "Transferrin Saturation %"})
(def tests
  "Every report label known to `test-names`; used as the set of line
  prefixes that mark a line as a test result."
  (keys test-names))
(defn text-of-pdf
  "Loads the PDF at path `file`, extracts its text with a default
  PDFTextStripper and returns the text as a vector of lines.
  The document is closed before returning."
  [^String file]
  (with-open [doc (PDDocument/load file)]
    ;; split-lines breaks on both \n and \r\n. Splitting on a bare \n left a
    ;; trailing \r on every line when the extracted text uses CRLF separators
    ;; (PDFTextStripper defaults to the platform line separator).
    (str/split-lines (.getText (PDFTextStripper.) doc))))
(defn as-test
  "Parses one test-result line into a sorted map.

  The text before the first \": \" is the report label, which is looked up
  in `test-names` (nil when the label is not a key there). The text after it
  is split on whitespace into result, units and reference range, in that
  order. Returns a map with the keys :test, :test-result, :test-units and
  :test-result-range; units and range are nil when absent or printed as
  \"-\", and the result is nil when the line carries no value."
  [line]
  (let [[label payload] (str/split line #": ")
        ;; `payload` is nil when nothing follows the label; treat that as an
        ;; empty value instead of failing in str/trim. Splitting on runs of
        ;; whitespace keeps the columns aligned when the extracted text has
        ;; repeated spaces or tabs between them.
        [value units ref-range] (str/split (str/trim (or payload "")) #"\s+")
        unless-dash (fn [s] (when (not= s "-") s))]
    (sorted-map :test (test-names label)
                :test-result (when-not (str/blank? value) value)
                :test-units (unless-dash units)
                :test-result-range (unless-dash ref-range))))
(defn as-meta
  "Turns metadata lines of the form \"name: value, extra\" into a sorted map.

  For each line, the text before the first \": \" becomes the keyword key,
  and the text between that separator and the first comma (trimmed) becomes
  the value. A line with no value after the separator yields an empty
  string rather than throwing. Later lines with the same name overwrite
  earlier ones."
  [lines]
  (into (sorted-map)
        (map (fn [line]
               (let [[label payload] (str/split line #": ")
                     ;; `payload` is nil when nothing follows the label, and
                     ;; the comma split is empty when the payload is just
                     ;; commas; both cases fall back to \"\".
                     value (-> (or payload "")
                               (str/split #",")
                               first
                               (or ""))]
                 [(keyword label) (str/trim value)])))
        lines))
(defn starting-with?
  "Returns a predicate of one string argument. The predicate yields true
  when the string starts with at least one of the prefixes in `coll`, and
  nil otherwise."
  [coll]
  (fn [line]
    (some #(str/starts-with? line %) coll)))
(defn read-file
  "Reads the PDF at `file` and returns a lazy sequence of test-result maps,
  one per recognised test line, each merged onto the metadata map parsed
  from the same file.

  Only lines containing \":\" are considered. Metadata lines are those
  starting with a prefix in `metadata`; test lines are those starting with
  a label in `tests`. Matching is by prefix, so a line whose full label is
  not a key of `test-names` still produces a row, with a nil :test."
  [file]
  (let [candidate-lines (filter #(str/includes? % ":") (text-of-pdf file))
        file-meta       (as-meta (filter (starting-with? metadata) candidate-lines))
        test-line?      (starting-with? tests)]
    (for [line candidate-lines
          :when (test-line? line)]
      (merge file-meta (as-test line)))))
(defn write-csv
  "Renders `tests`, a sequence of maps, as a CSV string.

  The header row holds the names of every key that occurs in any row, in
  order of first appearance. Each data row is written by looking those keys
  up in the row, so a row lacking a key gets an empty cell in that column
  instead of shifting its remaining values under the wrong headers. When
  all rows share the same keys the output is the same as writing each row's
  values in key order. An empty `tests` yields a single empty line."
  [tests]
  (let [columns (distinct (mapcat keys tests))
        header  (map name columns)
        rows    (map (fn [row] (mapv #(get row %) columns)) tests)]
    (with-open [out (StringWriter.)]
      (csv/write-csv out (cons header rows))
      (str out))))
;; Script entry point: every command-line argument is treated as a PDF path.
;; The rows read from all files are concatenated and printed as one CSV.
(let [rows (mapcat read-file *command-line-args*)]
  (println (write-csv rows)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment