Last active
August 31, 2023 19:52
-
-
Save nvictus/c72a07df052560f24c75576636892a16 to your computer and use it in GitHub Desktop.
Extract info fields from a VCF dataframe
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from __future__ import annotations\n", | |
"from io import BytesIO\n", | |
"\n", | |
"import numpy as np\n", | |
"import oxbow as ox\n", | |
"import pandas as pd\n", | |
"import polars as pl\n", | |
"import pyarrow as pa\n", | |
"import pysam" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# This dictionary maps the VCF-derived input types to numpy/pandas dtypes\n", | |
"TYPE_MAP = {\n", | |
" \"Integer\": \"int64\",\n", | |
" \"Float\": \"float64\",\n", | |
" \"String\": \"object\",\n", | |
" \"Flag\": \"bool\",\n", | |
"}\n", | |
"\n", | |
"\n", | |
"def read_vcf_as_pandas(vcf_path: str) -> pd.DataFrame:\n", | |
" ipc = ox.read_vcf(vcf_path)\n", | |
" return pa.ipc.open_file(BytesIO(ipc)).read_pandas()\n", | |
"\n", | |
"\n", | |
"def read_vcf_as_polars(vcf_path: str) -> pl.DataFrame:\n", | |
" ipc = ox.read_vcf(vcf_path)\n", | |
" return pl.read_ipc(ipc)\n", | |
"\n", | |
"\n", | |
"def read_info_schema(vcf_path: str) -> pd.DataFrame:\n", | |
" \"\"\" \n", | |
" Read the schema of the INFO column of a VCF.\n", | |
"\n", | |
" This currently uses pysam.\n", | |
"\n", | |
" Parameters\n", | |
" ----------\n", | |
" vcf_path : str\n", | |
" Path to bgzipped vcf, with tabix indexed .tbi file with similar name \n", | |
" in same folder.\n", | |
" \n", | |
" Returns\n", | |
" -------\n", | |
" pd.DataFrame\n", | |
" Dataframe with [name, number, type] columns\n", | |
"\n", | |
" Notes\n", | |
" -----\n", | |
" Possible values for `type` are: \"Integer\", \"Float\", \"String\", \"Flag\".\n", | |
"\n", | |
" Possible values for `number` are:\n", | |
" - An integer (e.g. 0, 1, 2, 3, 4, etc.) - for fields where the number\n", | |
" of values per VCF record is fixed. 0 means the field is a \"Flag\".\n", | |
" - A string (\"A\", \"G\", \"R\") - for fields where the number of\n", | |
" values per VCF record is determined by the number of alts, the total\n", | |
" number of alleles, or the number of genotypes, respectively.\n", | |
" - A dot (\".\") - for fields where the number of values per VCF record\n", | |
" varies, is unknown, or is unbounded.\n", | |
" \"\"\"\n", | |
" with pysam.VariantFile(vcf_path) as f:\n", | |
" return pd.DataFrame(\n", | |
" [(obj.name, obj.number, obj.type) for obj in f.header.info.values()],\n", | |
" columns=[\"name\", \"number\", \"type\"],\n", | |
" )\n", | |
"\n", | |
"\n", | |
"def _parse_into_info_record(\n", | |
" pairs: list[str], \n", | |
" fields: set[str],\n", | |
" dtype_map: pd.Series, \n", | |
" arity_map: pd.Series\n", | |
") -> dict:\n", | |
" \"\"\"\n", | |
" Parse a sequence of key-value INFO strings into a dictionary.\n", | |
"\n", | |
" Parameters\n", | |
" ----------\n", | |
" pairs : list[str]\n", | |
" List of `{key}={value}` pairs in the INFO column of a VCF. Note that\n", | |
" Flag fields will be represented as `{key}` only and variadic values\n", | |
" will be separated by commas.\n", | |
" fields : set[str]\n", | |
" Set of fields to extract from the INFO column.\n", | |
" dtype_map : pd.Series\n", | |
" Series mapping INFO field names to a corresponding numpy dtype.\n", | |
" arity_map : pd.Series\n", | |
" Series mapping INFO field names to a corresponding number or \"arity\".\n", | |
"\n", | |
" Returns\n", | |
" -------\n", | |
" dict\n", | |
" Dictionary mapping INFO field names to their values.\n", | |
" \"\"\"\n", | |
" record = {}\n", | |
" for pair in pairs:\n", | |
" key, *value = pair.split(\"=\")\n", | |
" if key in fields:\n", | |
" arity = arity_map[key]\n", | |
" dtype = dtype_map[key]\n", | |
" if arity == 0:\n", | |
" record[key] = True\n", | |
" elif arity == 1:\n", | |
" record[key] = np.array(value[0], dtype=dtype)\n", | |
" else:\n", | |
" record[key] = np.array(value[0].split(\",\"), dtype=dtype)\n", | |
" return record\n", | |
" \n", | |
"\n", | |
"def extract_info(\n", | |
" info: pd.Series, \n", | |
" schema: pd.DataFrame, \n", | |
" fields: list[str] | None = None\n", | |
") -> pd.DataFrame:\n", | |
" \"\"\"\n", | |
" Extracts fields from INFO column of a VCF.\n", | |
" \n", | |
" Parameters\n", | |
" ----------\n", | |
" info : pd.Series\n", | |
" The info column of a VCF dataframe.\n", | |
" schema : pd.DataFrame\n", | |
" Dataframe with [name, number, type] columns describing the INFO fields\n", | |
" of a VCF.\n", | |
" fields : list[str], optional\n", | |
" List of fields to extract from the INFO column. If None, all fields\n", | |
" will be extracted.\n", | |
"\n", | |
" Returns\n", | |
" -------\n", | |
" pd.DataFrame\n", | |
" Dataframe with columns corresponding to the requested fields.\n", | |
" \"\"\"\n", | |
" if fields is None:\n", | |
" fields = list(schema[\"name\"])\n", | |
" else:\n", | |
" names_available = set(schema[\"name\"])\n", | |
" for field in fields:\n", | |
" if field not in names_available:\n", | |
" raise ValueError(f\"Field '{field}' not found in INFO schema.\")\n", | |
" \n", | |
" # Create series mapping field names to dtypes and arities\n", | |
" dtype_map = schema.set_index(\"name\")[\"type\"].map(TYPE_MAP)\n", | |
" arity_map = schema.set_index(\"name\")[\"number\"]\n", | |
"\n", | |
" # Split the key-value pairs in the info column into record dictionaries\n", | |
" records = (\n", | |
" info\n", | |
" .str\n", | |
" .split(\";\")\n", | |
" .apply(\n", | |
" _parse_into_info_record, \n", | |
" fields=set(fields),\n", | |
" dtype_map=dtype_map, \n", | |
" arity_map=arity_map\n", | |
" )\n", | |
" )\n", | |
"\n", | |
" # Convert to dataframe\n", | |
" info_df = pd.DataFrame.from_records(records).convert_dtypes()\n", | |
"\n", | |
" # Include columns for requested fields that are missing from INFO records \n", | |
" # but still declared in the schema\n", | |
" for field in fields:\n", | |
" if field not in info_df.columns:\n", | |
" info_df[field] = pd.NA\n", | |
"\n", | |
" # Reorder columns as requested\n", | |
" info_df = info_df[fields]\n", | |
" return info_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>number</th>\n", | |
" <th>type</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>CONFLICT</td>\n", | |
" <td>.</td>\n", | |
" <td>String</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AC</td>\n", | |
" <td>A</td>\n", | |
" <td>Integer</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AF</td>\n", | |
" <td>A</td>\n", | |
" <td>Float</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>NS</td>\n", | |
" <td>1</td>\n", | |
" <td>Integer</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AN</td>\n", | |
" <td>1</td>\n", | |
" <td>Integer</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>LV</td>\n", | |
" <td>1</td>\n", | |
" <td>Integer</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>PS</td>\n", | |
" <td>1</td>\n", | |
" <td>String</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>AT</td>\n", | |
" <td>R</td>\n", | |
" <td>String</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name number type\n", | |
"0 CONFLICT . String\n", | |
"1 AC A Integer\n", | |
"2 AF A Float\n", | |
"3 NS 1 Integer\n", | |
"4 AN 1 Integer\n", | |
"5 LV 1 Integer\n", | |
"6 PS 1 String\n", | |
"7 AT R String" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vcf_path = \"DRR.vcf.gz\"\n", | |
"df = read_vcf_as_pandas(vcf_path)\n", | |
"schema = read_info_schema(vcf_path)\n", | |
"schema" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>chrom</th>\n", | |
" <th>pos</th>\n", | |
" <th>id</th>\n", | |
" <th>ref</th>\n", | |
" <th>alt</th>\n", | |
" <th>qual</th>\n", | |
" <th>filter</th>\n", | |
" <th>format</th>\n", | |
" <th>AC</th>\n", | |
" <th>LV</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>genome</td>\n", | |
" <td>18</td>\n", | |
" <td>>8>11</td>\n", | |
" <td>G</td>\n", | |
" <td>T</td>\n", | |
" <td>60.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>GT:AD</td>\n", | |
" <td>[12]</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>genome</td>\n", | |
" <td>26</td>\n", | |
" <td>>12>16</td>\n", | |
" <td>T</td>\n", | |
" <td>G,C</td>\n", | |
" <td>60.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>GT:AD</td>\n", | |
" <td>[2, 5]</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>genome</td>\n", | |
" <td>28</td>\n", | |
" <td>>16>19</td>\n", | |
" <td>A</td>\n", | |
" <td>G</td>\n", | |
" <td>60.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>GT:AD</td>\n", | |
" <td>[6]</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>genome</td>\n", | |
" <td>30</td>\n", | |
" <td>>19>22</td>\n", | |
" <td>G</td>\n", | |
" <td>A</td>\n", | |
" <td>60.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>GT:AD</td>\n", | |
" <td>[54]</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>genome</td>\n", | |
" <td>35</td>\n", | |
" <td>>22>27</td>\n", | |
" <td>GG</td>\n", | |
" <td>AG,GT</td>\n", | |
" <td>60.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>GT:AD</td>\n", | |
" <td>[2, 4]</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>482</th>\n", | |
" <td>genome</td>\n", | |
" <td>9544</td>\n", | |
" <td>>2561>2564</td>\n", | |
" <td>T</td>\n", | |
" <td>C</td>\n", | |
" <td>60.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>GT:AD</td>\n", | |
" <td>[2]</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>483</th>\n", | |
" <td>genome</td>\n", | |
" <td>9592</td>\n", | |
" <td>>2566>2569</td>\n", | |
" <td>C</td>\n", | |
" <td>T</td>\n", | |
" <td>60.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>GT:AD</td>\n", | |
" <td>[2]</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>484</th>\n", | |
" <td>genome</td>\n", | |
" <td>9598</td>\n", | |
" <td>>2569>2572</td>\n", | |
" <td>G</td>\n", | |
" <td>A</td>\n", | |
" <td>60.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>GT:AD</td>\n", | |
" <td>[20]</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>485</th>\n", | |
" <td>genome</td>\n", | |
" <td>9634</td>\n", | |
" <td>>2574>2577</td>\n", | |
" <td>T</td>\n", | |
" <td>C</td>\n", | |
" <td>60.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>GT:AD</td>\n", | |
" <td>[10]</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>486</th>\n", | |
" <td>genome</td>\n", | |
" <td>9676</td>\n", | |
" <td>>2581>2584</td>\n", | |
" <td>A</td>\n", | |
" <td>G</td>\n", | |
" <td>60.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>GT:AD</td>\n", | |
" <td>[83]</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>487 rows × 10 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" chrom pos id ref alt qual filter format AC LV\n", | |
"0 genome 18 >8>11 G T 60.0 NaN GT:AD [12] 0\n", | |
"1 genome 26 >12>16 T G,C 60.0 NaN GT:AD [2, 5] 0\n", | |
"2 genome 28 >16>19 A G 60.0 NaN GT:AD [6] 0\n", | |
"3 genome 30 >19>22 G A 60.0 NaN GT:AD [54] 0\n", | |
"4 genome 35 >22>27 GG AG,GT 60.0 NaN GT:AD [2, 4] 0\n", | |
".. ... ... ... .. ... ... ... ... ... ..\n", | |
"482 genome 9544 >2561>2564 T C 60.0 NaN GT:AD [2] 0\n", | |
"483 genome 9592 >2566>2569 C T 60.0 NaN GT:AD [2] 0\n", | |
"484 genome 9598 >2569>2572 G A 60.0 NaN GT:AD [20] 0\n", | |
"485 genome 9634 >2574>2577 T C 60.0 NaN GT:AD [10] 0\n", | |
"486 genome 9676 >2581>2584 A G 60.0 NaN GT:AD [83] 0\n", | |
"\n", | |
"[487 rows x 10 columns]" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"info = extract_info(df[\"info\"], schema, [\"AC\", \"LV\"])\n", | |
"out = df.drop(columns=[\"info\"]).assign(**info)\n", | |
"out = out.fillna(pd.NA)\n", | |
"out" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "vcfcodeathon2023", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.4" | |
}, | |
"orig_nbformat": 4 | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment