ynx0 · July 20, 2021 18:54
diff --git a/vcf2csv.py b/vcf2csv.py
 #!/usr/bin/env python3

 import vobject
 import csv
 import sys

 from functools import reduce
 from collections.abc import Iterable

 from pathlib import Path



 def flatten(l):
 	"""
 	lazily yield the leaf values of an arbitrarily nested iterable, in order
 	(adapted from https://stackoverflow.com/a/2158532)

 	str and bytes count as leaves even though they are iterable, and a
 	non-iterable argument is yielded back unchanged as a single item
 	"""
 	if isinstance(l, (str, bytes)) or not isinstance(l, Iterable):
 		# leaf value: hand it back as is
 		yield l
 		return

 	for item in l:
 		# recursing on a leaf simply yields that leaf, so no per-item branch is needed
 		yield from flatten(item)

 # this script assumes many contacts in a single vcf file, not many files for a single contact

 def read_vcf(vcf_path):
 	"""
 	return the entire text of the vcf file at vcf_path (a str or a Path)

 	the file is decoded as utf-8 (a leading byte order mark is dropped) and
 	the '.vcf' extension check ignores case
 	raises ValueError when the path does not have a '.vcf' extension
 	"""
 	vcf_path = Path(vcf_path)
 	# an explicit raise instead of an assert, so the check still runs under `python -O`
 	if vcf_path.suffix.lower() != '.vcf':
 		raise ValueError(f"Error: tried to read a vcf file, but got one with extension {vcf_path.suffix}")
 	# pin the encoding rather than depending on the platform's locale default
 	with open(vcf_path, "r", encoding="utf-8-sig") as f:
 		return f.read()


 def cards_from_text(vcf_text):
 	"""
 	parse vcf_text with vobject (validation turned on) and collect every
 	component it yields into a list
 	"""
 	components = vobject.readComponents(vcf_text, validate=True)
 	return [*components]


 def card_to_row(vcf_card) -> dict:
 	"""
 	turn one vcf card into a flat {attribute name: string} row for the csv writer

 	every vcf card stores its data in a value called 'contents'
 	contents is a dict which stores attributes and their values
 	each attribute can contain one or more entries in the form of a list
 	collectively we call this list attribute_data, while each element is an attribute value

 	each attribute value is reduced to one cleaned-up string, and the strings
 	belonging to the same attribute are joined with ', '
 	"""

 	def process_attr(attr_val):
 		# only the first leaf is kept, which removes any nesting if it exists
 		# (mainly for org, which has an extra layer); a value with no leaves at
 		# all becomes '' instead of raising IndexError
 		first_leaf = next(flatten(attr_val.value), '')
 		# trim the ends first, then turn remaining non-breaking spaces into plain ones
 		return str(first_leaf).strip().replace('\xa0', ' ')

 	# the helper is defined once here rather than once per attribute inside a loop
 	return {
 		attr_name: ', '.join(process_attr(attr_val) for attr_val in attr_data)
 		for attr_name, attr_data in vcf_card.contents.items()
 	}


 def gen_fieldnames(vcards) -> list:
 	"""
 	build the csv header: the sorted union of the attribute names found in vcards

 	each card contributes the keys of its 'contents' dict, so the header has
 	no missing nor extraneous columns; no cards at all gives an empty list
 	"""
 	# set().union() starts from an empty set, so unlike reduce() without an
 	# initial value it does not raise TypeError when vcards is empty
 	return sorted(set().union(*(card.contents.keys() for card in vcards)))


 def vcf2csv(vcf_path: Path):
 	"""
 	convert the vcf file at vcf_path into '<stem of vcf_path>.csv'

 	the output is opened by that bare file name (so it resolves against the
 	current working directory) and written as utf-8 with tab-delimited columns:
 	one header row from gen_fieldnames, then one row per card
 	vcf_path may be a str as well as a Path, matching what read_vcf accepts
 	"""
 	vcf_path = Path(vcf_path)  # .stem below needs a Path; a plain str has no such attribute
 	vcards = cards_from_text(read_vcf(vcf_path))
 	fieldnames = gen_fieldnames(vcards)
 	# newline='' leaves line endings to the csv module; the encoding is pinned so
 	# the output does not depend on the platform's locale default
 	with open(f"{vcf_path.stem}.csv", "w", newline='', encoding="utf-8") as f:
 		writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
 		writer.writeheader()
 		writer.writerows(card_to_row(card) for card in vcards)

 def main():
 	"""
 	command line entry point: convert the vcf file named by the first argument,
 	or print the usage text and exit with status 1 when no argument was given
 	"""
 	cli_args = sys.argv[1:]
 	if cli_args:
 		vcf2csv(Path(cli_args[0]))
 		return

 	print("Usage: ./vcf2csv.py <path/to/vcf file.vcf>")
 	print("Outputs: <original file name>.csv")
 	sys.exit(1)


 # run the converter only when this file is executed as a script, not when it is imported
 if __name__ == '__main__':
 	main()
	#!/usr/bin/env python3

	import vobject
	import csv
	import sys

	from functools import reduce
	from collections.abc import Iterable

	from pathlib import Path



	def flatten(l):
		"""
		lazily yield the leaf values of an arbitrarily nested iterable, in order
		(modified form of https://stackoverflow.com/a/2158532)

		str and bytes count as leaves even though they are iterable, and a
		non-iterable argument is yielded back unchanged as a single item
		"""
		def is_list(el):
			return isinstance(el, Iterable) and not isinstance(el, (str, bytes))

		if not is_list(l):
			# allow singleton / non-list values, return them as is
			yield l
		else:
			for el in l:
				if is_list(el):
					yield from flatten(el)
				else:
					yield el

	# this script assumes many contacts in a single vcf file, not many files for a single contact

	def read_vcf(vcf_path):
		"""
		return the entire text of the vcf file at vcf_path (a str or a Path)

		the file is decoded as utf-8 (a leading byte order mark is dropped) and
		the '.vcf' extension check ignores case
		raises ValueError when the path does not have a '.vcf' extension
		"""
		vcf_path = Path(vcf_path)
		# an explicit raise instead of an assert, so the check still runs under `python -O`
		if vcf_path.suffix.lower() != '.vcf':
			raise ValueError(f"Error: tried to read a vcf file, but got one with extension {vcf_path.suffix}")
		# pin the encoding rather than depending on the platform's locale default
		with open(vcf_path, "r", encoding="utf-8-sig") as f:
			return f.read()


	def cards_from_text(vcf_text):
		"""
		reads in all vcf cards from a given stream of vcf text
		(parsed by vobject with validation turned on) and returns them as a list
		"""
		return list(vobject.readComponents(vcf_text, validate=True))


	def card_to_row(vcf_card) -> dict:
		"""
		turn one vcf card into a flat {attribute name: string} row for the csv writer

		every vcf card stores its data in a value called 'contents'
		contents is a dict which stores attributes and their values
		each attribute can contain one or more entries in the form of a list
		collectively we call this list attribute_data, while each element is an attribute value

		each attribute value is reduced to one cleaned-up string, and the strings
		belonging to the same attribute are joined with ', '
		"""

		def process_attr(attr_val):
			# only the first leaf is kept, which removes any nesting if it exists
			# (mainly for org, which has an extra layer); a value with no leaves at
			# all becomes '' instead of raising IndexError
			first_leaf = next(flatten(attr_val.value), '')
			# trim the ends first, then turn remaining non-breaking spaces into plain ones
			return str(first_leaf).strip().replace('\xa0', ' ')

		# the helper is defined once here rather than once per attribute inside a loop
		return {
			attr_name: ', '.join(process_attr(attr_val) for attr_val in attr_data)
			for attr_name, attr_data in vcf_card.contents.items()
		}


	def gen_fieldnames(vcards) -> list:
		"""
		build the csv header: the sorted union of the attribute names found in vcards

		each card contributes the keys of its 'contents' dict, so the header has
		no missing nor extraneous columns; no cards at all gives an empty list
		"""
		# set().union() starts from an empty set, so unlike reduce() without an
		# initial value it does not raise TypeError when vcards is empty
		return sorted(set().union(*(card.contents.keys() for card in vcards)))


	def vcf2csv(vcf_path: Path):
		"""
		convert the vcf file at vcf_path into '<stem of vcf_path>.csv'

		the output is opened by that bare file name (so it resolves against the
		current working directory) and written as utf-8 with tab-delimited columns:
		one header row from gen_fieldnames, then one row per card
		vcf_path may be a str as well as a Path, matching what read_vcf accepts
		"""
		vcf_path = Path(vcf_path)  # .stem below needs a Path; a plain str has no such attribute
		vcards = cards_from_text(read_vcf(vcf_path))
		fieldnames = gen_fieldnames(vcards)
		# newline='' leaves line endings to the csv module; the encoding is pinned so
		# the output does not depend on the platform's locale default
		with open(f"{vcf_path.stem}.csv", "w", newline='', encoding="utf-8") as f:
			writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
			writer.writeheader()
			writer.writerows(card_to_row(card) for card in vcards)

	def main():
		"""
		command line entry point: convert the vcf file named by the first argument,
		or print the usage text and exit with status 1 when no argument was given
		"""
		if len(sys.argv) <= 1:
			print("Usage: ./vcf2csv.py <path/to/vcf file.vcf>")
			print("Outputs: <original file name>.csv")
			sys.exit(1)

		vcf_path = Path(sys.argv[1])
		vcf2csv(vcf_path)


	# run the converter only when this file is executed as a script, not when it is imported
	if __name__ == '__main__':
		main()
No results found