eweitz · August 2, 2024 17:49
diff --git a/uncommon_ontology_chars.py b/uncommon_ontology_chars.py
 """Find less common string characters in Cell Ontology

 Prints characters that aren't an ASCII letter, number, hyphen, space, or
 underscore that appear among standard labels in Cell Ontology (CL).
 This snippet helps get testing input for more robust ontology handling.

 Tested on Python 3.10.9, without dependencies.  Output as of 2024-08-02
 (the characters found; their order is not significant):
 .=/[:]–><';,()+
 """

 import xml.etree.ElementTree as ET
 import re
 import urllib.request
 from pathlib import Path

 # Local cache of the ontology; reused across runs to avoid re-downloading
 CL_OWL_PATH = 'cl.owl'

 # Official Cell Ontology OWL URL, per
 # https://www.ebi.ac.uk/ols4/ontologies/cl
 CL_OWL_URL = 'http://purl.obolibrary.org/obo/cl.owl'

 # Prefix map needed to address rdfs:label elements in ElementTree queries
 RDFS_NAMESPACES = {'rdfs': 'http://www.w3.org/2000/01/rdf-schema#'}

 # Matches any single character that is NOT an ASCII letter, digit, hyphen,
 # underscore, or space.  The "-" directly after the "0-9" range cannot start
 # another range, so it is taken as a literal hyphen.
 UNCOMMON_CHAR_RE = re.compile('[^A-Za-z0-9-_ ]')


 def fetch_ontology(path=CL_OWL_PATH, url=CL_OWL_URL):
     """Download the ontology to `path` unless a file already exists there.

     Args:
         path: Where the OWL file is cached on disk.
         url: Where to download the OWL file from on a cache miss.

     Returns:
         The cache location as a `pathlib.Path`.
     """
     cache = Path(path)
     if cache.is_file():
         return cache

     # Read the whole response before touching the disk, so a failed
     # download does not leave an empty cache file behind.
     with urllib.request.urlopen(url) as response:
         cell_ontology_owl = response.read()
     cache.write_bytes(cell_ontology_owl)
     return cache


 def find_uncommon_chars(texts):
     """Return the distinct uncommon characters found across `texts`.

     Args:
         texts: Iterable of label strings.  Entries that are `None` or empty
             are skipped; `Element.text` is `None` for an element that has
             no text content, and `re.findall` would raise on it.

     Returns:
         A string with each uncommon character exactly once, sorted by code
         point so that repeated runs print the same thing.
     """
     unique_chars = set()
     for text in texts:
         if text:
             unique_chars.update(UNCOMMON_CHAR_RE.findall(text))
     return ''.join(sorted(unique_chars))


 def main():
     """Fetch (or reuse) the ontology and print its uncommon label characters."""
     # Fetch ontology file if not already cached
     fetch_ontology()

     # Parse ontology XML
     root = ET.parse(CL_OWL_PATH).getroot()
     labels = root.findall('.//rdfs:label', RDFS_NAMESPACES)

     # Find uncommon characters, dedup them
     unique_finds = find_uncommon_chars(label.text for label in labels)

     print('unique_finds')
     print(unique_finds)


 if __name__ == '__main__':
     main()
	"""Find less common string characters in Cell Ontology

	Prints characters that aren't an ASCII letter, number, hyphen, space, or
	underscore that appear among standard labels in Cell Ontology (CL).
	This snippet helps get testing input for more robust ontology handling.

	Tested on Python 3.10.9, without dependencies. Output as of 2024-08-02
	(the characters found; their order is not significant):
	.=/[:]–><';,()+
	"""

	import xml.etree.ElementTree as ET
	import re
	import urllib.request
	from pathlib import Path

	# Local cache of the ontology; reused across runs to avoid re-downloading
	CL_OWL_PATH = 'cl.owl'

	# Official Cell Ontology OWL URL, per
	# https://www.ebi.ac.uk/ols4/ontologies/cl
	CL_OWL_URL = 'http://purl.obolibrary.org/obo/cl.owl'

	# Prefix map needed to address rdfs:label elements in ElementTree queries
	RDFS_NAMESPACES = {'rdfs': 'http://www.w3.org/2000/01/rdf-schema#'}

	# Matches any single character that is NOT an ASCII letter, digit, hyphen,
	# underscore, or space.  The "-" directly after the "0-9" range cannot start
	# another range, so it is taken as a literal hyphen.
	UNCOMMON_CHAR_RE = re.compile('[^A-Za-z0-9-_ ]')


	def fetch_ontology(path=CL_OWL_PATH, url=CL_OWL_URL):
	    """Download the ontology to `path` unless a file already exists there.

	    Args:
	        path: Where the OWL file is cached on disk.
	        url: Where to download the OWL file from on a cache miss.

	    Returns:
	        The cache location as a `pathlib.Path`.
	    """
	    cache = Path(path)
	    if cache.is_file():
	        return cache

	    # Read the whole response before touching the disk, so a failed
	    # download does not leave an empty cache file behind.
	    with urllib.request.urlopen(url) as response:
	        cell_ontology_owl = response.read()
	    cache.write_bytes(cell_ontology_owl)
	    return cache


	def find_uncommon_chars(texts):
	    """Return the distinct uncommon characters found across `texts`.

	    Args:
	        texts: Iterable of label strings.  Entries that are `None` or empty
	            are skipped; `Element.text` is `None` for an element that has
	            no text content, and `re.findall` would raise on it.

	    Returns:
	        A string with each uncommon character exactly once, sorted by code
	        point so that repeated runs print the same thing.
	    """
	    unique_chars = set()
	    for text in texts:
	        if text:
	            unique_chars.update(UNCOMMON_CHAR_RE.findall(text))
	    return ''.join(sorted(unique_chars))


	def main():
	    """Fetch (or reuse) the ontology and print its uncommon label characters."""
	    # Fetch ontology file if not already cached
	    fetch_ontology()

	    # Parse ontology XML
	    root = ET.parse(CL_OWL_PATH).getroot()
	    labels = root.findall('.//rdfs:label', RDFS_NAMESPACES)

	    # Find uncommon characters, dedup them
	    unique_finds = find_uncommon_chars(label.text for label in labels)

	    print('unique_finds')
	    print(unique_finds)


	if __name__ == '__main__':
	    main()
No results found