Created
June 21, 2018 01:01
-
-
Save tbl3rd/a47f203a5b371933539305d2aaf39e3b to your computer and use it in GitHub Desktop.
Clojure spec Variant Call Format files loosely with examples.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns vcf | |
"Hack VCFs." | |
(:require [clojure.java.io :as io] | |
[clojure.spec.alpha :as s] | |
[clojure.string :as str]) | |
(:import [java.util.zip GZIPInputStream])) | |
;; Sketch a loose spec of a VCF file, but ::vcf-file takes too long, | |
;; so this is mostly just for documentation. | |
;; | |
(letfn [(is? [k v] (fn [x] (= v (apply str (k x)))))] | |
(let [lower "abcdefghijklmnopqrstuvwxyz" | |
upper (str/upper-case lower) | |
digit "0123456789" | |
other "(_)"] | |
(s/def ::whatever (s/* (constantly true))) | |
(s/def ::hash (set "#")) | |
(s/def ::hashes (s/cat :hash0 ::hash :hash1 ::hash)) | |
(s/def ::key (s/+ (set (concat lower upper digit other)))) | |
(s/def ::not-hash (complement (set "#"))) | |
(s/def ::tab #{\tab}) | |
(s/def ::not-tab (complement #{\tab})) | |
(s/def ::column (s/+ ::not-tab)) | |
(s/def ::header (s/cat :tab ::tab :column ::column)) | |
(s/def ::meta-line (s/and string? (s/conformer seq) | |
(s/cat :hashes ::hashes | |
:key ::key | |
:= (set "=") | |
:value ::whatever))) | |
(s/def ::form-line (s/and ::meta-line (is? :key "fileformat"))) | |
(s/def ::head-line (s/and string? (s/conformer seq) | |
(s/cat :hash ::hash | |
:chrom ::column | |
:headers (s/+ ::header)) | |
(is? :chrom "CHROM"))) | |
(s/def ::body-line (s/and string? (s/conformer seq) | |
(s/cat :not-hash ::not-hash | |
:stuff ::whatever))) | |
(s/def ::vcf-line (s/or :form ::form-line | |
:meta ::meta-line | |
:head ::head-line | |
:body ::body-line)) | |
(s/def ::vcf-file (s/cat :form-line ::form-line | |
:meta-lines (s/* ::meta-line) | |
:head-line ::head-line | |
:body-lines (s/* (s/or :head ::head-line | |
:body ::body-line)))))) | |
(defn inflate-lines | |
"Return a lazy sequence of lines from the gzipped VCF file." | |
[vcf] | |
(-> vcf | |
io/input-stream | |
GZIPInputStream. | |
io/reader | |
line-seq)) | |
(defn valid? | |
"True when first 9999 lines of VCF is a valid VCF file. | |
This takes about a minute to run now!" | |
[vcf] | |
(->> vcf | |
inflate-lines | |
(take 9999) | |
(s/valid? ::vcf-file))) | |
(defn parse-lines | |
"Lazily parse the lines of VCF." | |
[vcf] | |
(letfn [(parse [line] (s/conform ::vcf-line line))] | |
(->> vcf | |
inflate-lines | |
(map parse)))) | |
(defmacro make-map | |
"Map SYMBOLS as keywords to their values in the environment." | |
[& symbols] | |
(let [symbols# symbols] | |
(zipmap (map keyword symbols) symbols))) | |
(defn section | |
"Throw or return a map of the sections of the VCF." | |
[vcf] | |
(letfn [(meta? [line] (str/starts-with? line "##")) | |
(head? [line] (str/starts-with? line "#"))] | |
(let [[meta-lines lines] (->> vcf | |
inflate-lines | |
(split-with meta?)) | |
[version-line & meta-lines] meta-lines | |
[header-lines other-lines] (split-with head? lines)] | |
(when-not (s/valid? ::form-line version-line) | |
(throw (new RuntimeException (str "No ##fileformat line in " vcf)))) | |
(when-not (s/valid? ::head-line (first header-lines)) | |
(throw (new RuntimeException (str "No #CHROM header line in " vcf)))) | |
(let [version (->> version-line | |
(s/conform ::form-line) | |
:value | |
(apply str))] | |
(make-map version meta-lines header-lines other-lines))))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment