Created June 21, 2018 01:01
Loosely spec Variant Call Format (VCF) files with clojure.spec, with examples.
(ns vcf
  "Hack VCFs."
  (:require [clojure.java.io :as io]
            [clojure.spec.alpha :as s]
            [clojure.string :as str])
  (:import [java.util.zip GZIPInputStream]))
;; Sketch a loose spec of a VCF file.  Validating against ::vcf-file
;; takes too long, so this is mostly just for documentation.
;;
;; Every line spec first requires a string, then conforms it to a seq of
;; characters so that the regex ops (s/cat, s/+, s/*) match one character
;; at a time.
;;
;; is? builds a predicate over a conformed s/cat map: it is true when the
;; characters captured under key K, joined back into a string, equal V.
(letfn [(is? [k v] (fn [x] (= v (apply str (k x)))))]
;; Character classes allowed in the key of a meta line.
(let [lower "abcdefghijklmnopqrstuvwxyz"
upper (str/upper-case lower)
digit "0123456789"
other "(_)"]
;; Zero or more elements of any kind.
(s/def ::whatever (s/* (constantly true)))
;; A single # character.
(s/def ::hash (set "#"))
;; Exactly two # characters in a row.
(s/def ::hashes (s/cat :hash0 ::hash :hash1 ::hash))
;; One or more characters drawn from lower, upper, digit and other.
(s/def ::key (s/+ (set (concat lower upper digit other))))
;; Any single element other than the # character.
(s/def ::not-hash (complement (set "#")))
(s/def ::tab #{\tab})
(s/def ::not-tab (complement #{\tab}))
;; One or more non-tab characters.
(s/def ::column (s/+ ::not-tab))
;; A tab followed by a column.
(s/def ::header (s/cat :tab ::tab :column ::column))
;; A line of the form ##key=value, where value may be anything (or empty).
(s/def ::meta-line (s/and string? (s/conformer seq)
(s/cat :hashes ::hashes
:key ::key
:= (set "=")
:value ::whatever)))
;; A meta line whose key spells "fileformat".
(s/def ::form-line (s/and ::meta-line (is? :key "fileformat")))
;; A single #, a first column that spells "CHROM", then one or more
;; tab-prefixed columns.
(s/def ::head-line (s/and string? (s/conformer seq)
(s/cat :hash ::hash
:chrom ::column
:headers (s/+ ::header))
(is? :chrom "CHROM")))
;; A line whose first character is not #, followed by anything.
(s/def ::body-line (s/and string? (s/conformer seq)
(s/cat :not-hash ::not-hash
:stuff ::whatever)))
;; Any one line, tagged by kind.  :form is listed before :meta, so a
;; fileformat line is tagged :form even though it is also a meta line.
(s/def ::vcf-line (s/or :form ::form-line
:meta ::meta-line
:head ::head-line
:body ::body-line))
;; A sequence of lines: one fileformat line, any number of meta lines,
;; one head line, then any number of head or body lines.
(s/def ::vcf-file (s/cat :form-line ::form-line
:meta-lines (s/* ::meta-line)
:head-line ::head-line
:body-lines (s/* (s/or :head ::head-line
:body ::body-line))))))
(defn inflate-lines
  "Return a lazy sequence of lines from the gzipped VCF file.

  VCF is anything clojure.java.io/input-stream accepts (a path string,
  File, URL, InputStream, ...).  The underlying reader is never closed
  explicitly: the sequence is lazy, so closing it here would truncate
  the lines still unread."
  [vcf]
  (-> vcf
      io/input-stream
      (GZIPInputStream.)   ; gunzip the raw bytes
      io/reader
      line-seq))
(defn valid?
  "True when first 9999 lines of VCF is a valid VCF file.
  This takes about a minute to run now!

  VCF names a gzipped VCF file; its lines are read with inflate-lines
  and checked as a whole against the ::vcf-file spec."
  [vcf]
  (->> vcf
       inflate-lines      ; work on the file's lines, not on VCF itself
       (take 9999)
       (s/valid? ::vcf-file)))
(defn parse-lines
  "Lazily parse the lines of VCF.

  VCF names a gzipped VCF file.  Returns a lazy sequence with one
  element per line: the result of conforming that line to ::vcf-line,
  which is :clojure.spec.alpha/invalid for a line matching no
  alternative."
  [vcf]
  (letfn [(parse [line] (s/conform ::vcf-line line))]
    (->> vcf
         inflate-lines    ; work on the file's lines, not on VCF itself
         (map parse))))
(defmacro make-map
  "Map SYMBOLS as keywords to their values in the environment.

  (make-map a b) expands to the map literal {:a a, :b b}, so each
  symbol is evaluated at the call site.  With no symbols it expands
  to an empty map."
  [& symbols]
  ;; The expansion is the map itself; no syntax-quote or gensym needed.
  (zipmap (map keyword symbols) symbols))
(defn section
  "Throw or return a map of the sections of the VCF.

  VCF names a gzipped VCF file.  The returned map has the keys
    :version       the value of the leading ##fileformat line, as a string
    :meta-lines    the remaining leading ## lines
    :header-lines  the following lines that start with #
    :other-lines   everything after the header lines
  Throws RuntimeException when the first line is not a ##fileformat
  line, or when no #CHROM header line follows the meta lines."
  [vcf]
  (letfn [(meta? [line] (str/starts-with? line "##"))
          (head? [line] (str/starts-with? line "#"))]
    (let [[meta-lines lines]          (->> vcf
                                           inflate-lines
                                           (split-with meta?))
          [version-line & meta-lines] meta-lines
          [header-lines other-lines]  (split-with head? lines)]
      ;; A nil line (empty file or missing section) fails string? in the
      ;; spec, so these checks also cover the missing-line cases.
      (when-not (s/valid? ::form-line version-line)
        (throw (new RuntimeException (str "No ##fileformat line in " vcf))))
      (when-not (s/valid? ::head-line (first header-lines))
        (throw (new RuntimeException (str "No #CHROM header line in " vcf))))
      (let [version (->> version-line
                         (s/conform ::form-line)
                         :value          ; characters after the = sign
                         (apply str))]
        (make-map version meta-lines header-lines other-lines)))))
