Created
July 20, 2021 18:54
-
-
Save ynx0/8fe4cbcd0087c2906c24d00c13f378ba to your computer and use it in GitHub Desktop.
converts vcf files to csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import vobject | |
import csv | |
import sys | |
from functools import reduce | |
from collections.abc import Iterable | |
from pathlib import Path | |
def flatten(l):
    """
    Lazily yield the leaf values of an arbitrarily nested iterable.

    Strings and bytes are treated as leaves, so they are never split into
    individual characters. A value that is not iterable at all is yielded
    unchanged as the only item.

    Modified form of https://stackoverflow.com/a/2158532
    """
    def _is_nested(candidate):
        # str / bytes are iterable, but we want them kept whole
        if isinstance(candidate, (str, bytes)):
            return False
        return isinstance(candidate, Iterable)

    if not _is_nested(l):
        # allow singleton / non-list values, hand them back as is
        yield l
        return

    for item in l:
        # the recursive call covers both cases: a nested container is
        # expanded, a plain leaf comes straight back via the guard above
        yield from flatten(item)
# NOTE: this script expects one .vcf file that holds many contacts, not a separate .vcf file per contact
def read_vcf(vcf_path):
    """
    Return the complete text of the vCard file at ``vcf_path``.

    vcf_path: ``str`` or ``Path`` of a file whose extension is ``.vcf``
              (compared case-insensitively, so ``.VCF`` is accepted too).

    Raises ValueError if the extension is anything else. Errors from opening
    or decoding the file (OSError, UnicodeDecodeError) propagate unchanged.
    """
    vcf_path = Path(vcf_path)
    # An explicit raise instead of `assert`: asserts are stripped when Python
    # runs with -O, which would silently disable this check.
    if vcf_path.suffix.lower() != '.vcf':
        raise ValueError(f"Error: tried to read a vcf file, but got one with extension {vcf_path.suffix}")
    # Decode as UTF-8 regardless of the platform's locale encoding;
    # "utf-8-sig" additionally drops a leading byte-order mark if one is
    # present, so the text starts directly with the first vCard line.
    with open(vcf_path, "r", encoding="utf-8-sig") as f:
        return f.read()
def cards_from_text(vcf_text):
    """
    Parse ``vcf_text`` and return every vcf card found in it as a list.

    The text is handed to ``vobject.readComponents`` with validation turned
    on, and whatever it produces is materialized into a list before returning.
    """
    parsed_components = vobject.readComponents(vcf_text, validate=True)
    return list(parsed_components)
def card_to_row(vcf_card) -> dict:
    """
    Turn one vcf card into a flat ``{attribute name: text}`` dict (one csv row).

    A card keeps its data in ``contents``, a dict that maps each attribute
    name to a list of entries. Every entry exposes its payload as ``.value``.
    Each entry is reduced to a cleaned-up string, and all entries of the same
    attribute are joined with ', ' into a single cell.
    """
    def _entry_text(entry):
        # Flatten away any nesting (mainly for org, which has an extra list
        # layer) and keep only the first leaf value.
        first_leaf = list(flatten(entry.value))[0]
        # Normalise to trimmed text; non-breaking spaces become plain spaces.
        # No other control / non-printable characters are filtered.
        return str(first_leaf).strip().replace('\xa0', ' ')

    return {
        name: ', '.join([_entry_text(entry) for entry in entries])
        for name, entries in vcf_card.contents.items()
    }
def gen_fieldnames(vcards) -> list:
    """
    Return the sorted union of the attribute names used by any card.

    vcards: iterable of objects whose ``contents`` mapping is keyed by
            attribute name.

    The result is the csv header: every attribute that occurs in at least one
    card appears exactly once, so no column is missing and none is extraneous.
    An empty ``vcards`` gives an empty list.
    """
    # Iterating a contents dict yields its keys, so union() collects every
    # attribute name. Unlike reduce() without an initial value, union() with
    # zero arguments is just the empty set, so having no cards at all no
    # longer raises TypeError.
    all_names = set().union(*(card.contents for card in vcards))
    return sorted(all_names)
def vcf2csv(vcf_path: Path):
    """
    Convert the vcf file at ``vcf_path`` into a tab-delimited csv file.

    vcf_path: ``Path`` (a plain ``str`` is accepted as well) of the input file.

    The output is written to ``<stem of vcf_path>.csv`` relative to the
    current working directory. It has one header row holding every attribute
    name found in any card, followed by one row per card.
    """
    # Coerce here as well, so a str argument does not fail on `.stem` below.
    vcf_path = Path(vcf_path)
    vcards = cards_from_text(read_vcf(vcf_path))
    fieldnames = gen_fieldnames(vcards)
    # newline='' lets the csv module control line endings itself. The explicit
    # utf-8 encoding keeps non-ASCII contact data writable even when the
    # platform's default encoding cannot represent it.
    with open(f"{vcf_path.stem}.csv", "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(card_to_row(card) for card in vcards)
def main():
    """
    Command-line entry point.

    Takes the vcf path from the first command-line argument and converts it.
    Without an argument, prints a short usage text and exits with status 1.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: ./vcf2csv.py <path/to/vcf file.vcf>")
        print("Outputs: <original file name>.csv")
        sys.exit(1)

    # any arguments after the first are ignored
    vcf2csv(Path(cli_args[0]))
# Run the converter only when executed as a script, not when imported.
if "__main__" == __name__:
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment