Skip to content

Instantly share code, notes, and snippets.

@ynx0
Created July 20, 2021 18:54
Show Gist options
  • Save ynx0/8fe4cbcd0087c2906c24d00c13f378ba to your computer and use it in GitHub Desktop.
Save ynx0/8fe4cbcd0087c2906c24d00c13f378ba to your computer and use it in GitHub Desktop.
converts vcf files to csv
#!/usr/bin/env python3
import vobject
import csv
import sys
from functools import reduce
from collections.abc import Iterable
from pathlib import Path
def flatten(l):
    """
    Lazily yield the leaf values of an arbitrarily nested iterable.

    Strings and bytes are treated as leaves rather than being split into
    characters. A value that is not iterable at all is yielded unchanged as
    the only item, so callers may pass either a container or a single value.

    Adapted from https://stackoverflow.com/a/2158532
    """
    if isinstance(l, (str, bytes)) or not isinstance(l, Iterable):
        # Leaf (or singleton input): hand it back as is.
        yield l
        return
    for item in l:
        # Recursing on every item is enough: leaves take the branch above.
        yield from flatten(item)
# this script assumes many contacts in a single vcf file, not many files for a single contact
def read_vcf(vcf_path, encoding="utf-8-sig"):
    """
    Return the entire text content of a .vcf file.

    Args:
        vcf_path: Path (``str`` or ``Path``) of the file to read. Its
            extension must be ``.vcf``; letter case is ignored.
        encoding: Text encoding used to decode the file. The default,
            ``utf-8-sig``, reads UTF-8 and drops a leading byte-order mark
            if one is present.

    Returns:
        The decoded file content as a single string.

    Raises:
        ValueError: If the file extension is not ``.vcf``.
    """
    vcf_path = Path(vcf_path)
    # Explicit raise instead of assert: asserts vanish under `python -O`.
    if vcf_path.suffix.lower() != '.vcf':
        raise ValueError(
            f"Error: tried to read a vcf file, but got one with extension {vcf_path.suffix}"
        )
    return vcf_path.read_text(encoding=encoding)
def cards_from_text(vcf_text):
    """
    Parse a stream of vcf text and return every card found in it as a list.

    Parsing is delegated to ``vobject.readComponents`` with validation
    switched on.
    """
    components = vobject.readComponents(vcf_text, validate=True)
    return list(components)
def card_to_row(vcf_card) -> dict:
    """
    Convert one parsed vcf card into a flat ``{attribute name: text}`` row.

    Every vcf card stores its data in a dict called ``contents`` that maps
    each attribute name to a list of one or more entries. Each entry's
    ``.value`` is reduced to a single cleaned-up string, and the strings of
    all entries for one attribute are joined with ``', '``.

    Args:
        vcf_card: Object exposing a ``contents`` mapping of
            attribute name -> list of entries, each with a ``.value``.

    Returns:
        Dict with one string per attribute name, suitable for
        ``csv.DictWriter.writerow``.
    """
    def first_text(entry):
        # Values may be nested (the original author notes ORG carries an
        # extra list layer); only the first leaf is kept. The '' default
        # stops an empty value from raising, unlike indexing [0] would.
        leaf = next(flatten(entry.value), '')
        # Trim surrounding whitespace, then turn non-breaking spaces into
        # ordinary ones.
        return str(leaf).strip().replace('\xa0', ' ')

    return {
        attr_name: ', '.join(first_text(entry) for entry in attr_data)
        for attr_name, attr_data in vcf_card.contents.items()
    }
def gen_fieldnames(vcards) -> list:
    """
    Build the csv header: the sorted union of attribute names over all cards.

    Each card contributes the keys of its ``contents`` mapping. Taking the
    union gives one column for every attribute that appears on at least one
    card, with no missing and no duplicated columns.

    Args:
        vcards: Iterable of objects exposing a ``contents`` mapping.

    Returns:
        Sorted list of attribute names; an empty list when there are no cards.
    """
    # Accumulating into a set starts from an empty union, so an empty input
    # gives [] instead of the TypeError reduce() raises without an initial value.
    fieldnames = set()
    for card in vcards:
        fieldnames.update(card.contents)
    return sorted(fieldnames)
def vcf2csv(vcf_path: Path):
    """
    Convert a multi-contact .vcf file into ``<stem>.csv`` in the working directory.

    The output has one header row, then one row per card and one column per
    attribute name found on any card. Despite the ``.csv`` extension, fields
    are separated by tabs. The file is written as UTF-8.

    Args:
        vcf_path: Path to the source .vcf file (``Path`` or ``str``).
    """
    # Accept plain strings too, as read_vcf does, so `.stem` below is safe.
    vcf_path = Path(vcf_path)
    vcards = cards_from_text(read_vcf(vcf_path))
    fieldnames = gen_fieldnames(vcards)
    # Convert every card before opening the output, so a failure part-way
    # through does not leave a half-written file behind.
    rows = [card_to_row(card) for card in vcards]
    # Explicit encoding: contact names are often outside the locale's
    # default code page, which would otherwise fail at write time.
    with open(f"{vcf_path.stem}.csv", "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(rows)
def main():
    """
    Command-line entry point.

    Expects the vcf path as the first argument. With no argument, prints a
    short usage message and exits with status 1.
    """
    args = sys.argv[1:]
    if not args:
        print("Usage: ./vcf2csv.py <path/to/vcf file.vcf>")
        print("Outputs: <original file name>.csv")
        sys.exit(1)
    vcf2csv(Path(args[0]))
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment