Created
August 2, 2024 17:49
-
-
Save eweitz/ab559f0a5a32997b19f2c5e7498727b0 to your computer and use it in GitHub Desktop.
Find less common string characters in Cell Ontology
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Find less common string characters in Cell Ontology

Prints characters that aren't an ASCII letter, number, hyphen, space, or
underscore that appear among standard labels in Cell Ontology (CL).

This snippet helps get testing input for more robust ontology handling.

Tested on Python 3.10.9, without dependencies.  Characters found as of
2024-08-02, in the sorted (code point) order this script prints them:

'()+,./:;<=>[]–
"""

import xml.etree.ElementTree as ET
import re
import urllib.request
from pathlib import Path

# Official Cell Ontology OWL URL, per
# https://www.ebi.ac.uk/ols4/ontologies/cl
CL_URL = 'http://purl.obolibrary.org/obo/cl.owl'

# Local cache of the downloaded ontology, relative to the working directory
CL_PATH = Path('cl.owl')

RDFS_NAMESPACES = {'rdfs': 'http://www.w3.org/2000/01/rdf-schema#'}

# Anything other than an ASCII letter, digit, underscore, space, or hyphen.
# The hyphen sits last in the class so it can only be read as a literal.
UNCOMMON_CHAR = re.compile(r'[^A-Za-z0-9_ -]')


def fetch_ontology(path=CL_PATH, url=CL_URL):
    """Return the path to the ontology file, downloading it if not cached.

    Args:
        path: Where the OWL file is (or will be) stored.
        url: Location to download the OWL file from when `path` is missing.

    Returns:
        `path` as a `pathlib.Path`.
    """
    path = Path(path)
    if path.is_file():
        return path

    # Read the whole response before touching the cache file, so a failed
    # request does not leave an empty `cl.owl` behind.
    with urllib.request.urlopen(url) as response:
        owl_bytes = response.read()
    path.write_bytes(owl_bytes)
    return path


def read_labels(path):
    """Return the text of every `rdfs:label` element in the OWL file.

    Entries are `None` for label elements that have no text content.
    """
    root = ET.parse(path).getroot()
    return [
        label.text for label in root.findall('.//rdfs:label', RDFS_NAMESPACES)
    ]


def find_uncommon_chars(texts):
    """Return the distinct uncommon characters found across `texts`.

    Args:
        texts: Iterable of label strings.  `None` and empty entries are
            skipped, since an empty XML element yields `None` as its text.

    Returns:
        A string holding each uncommon character once, sorted by code point
        so that the result is identical from run to run.
    """
    found = set()
    for text in texts:
        if text:
            found.update(UNCOMMON_CHAR.findall(text))
    return ''.join(sorted(found))


def main():
    """Fetch (or reuse) the ontology, then print its uncommon characters."""
    owl_path = fetch_ontology()
    unique_finds = find_uncommon_chars(read_labels(owl_path))
    print('unique_finds')
    print(unique_finds)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment