Created
June 28, 2025 12:15
-
-
Save dridk/3be7d1994b7e11b95579d70c3a223c61 to your computer and use it in GitHub Desktop.
Extraction des codes CIM10 depuis un fichier RDF vers un fichier parquet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import rdflib | |
| import polars | |
# Load the terminology graph from the RDF file.
g = rdflib.Graph()
g.parse("cim10.rdf")

# SPARQL query fetching the variables of interest.
# Each concept is joined with every one of its ancestors (?path) and with its
# optional synonyms / notes, so a concept appears on several result rows;
# the rows are collapsed back to one per concept in the aggregation below.
query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xkos: <http://rdf-vocabulary.ddialliance.org/xkos#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX atih: <http://data.esante.gouv.fr/atih/>

SELECT ?concept ?code ?label ?path ?type ?synonymes ?inclusion_note ?exclusion_note
WHERE {
    ?concept rdfs:subClassOf* atih:cim10 .
    ?concept rdfs:label ?label.
    ?concept skos:notation ?code.
    ?concept rdfs:subClassOf+ ?superClass.
    ?superClass skos:notation ?path.
    ?concept dc:type ?type.
    OPTIONAL { ?concept skos:altLabel ?synonymes. }
    OPTIONAL { ?concept atih:inclusionNote ?inclusion_note . }
    OPTIONAL { ?concept atih:exclusionNote ?exclusion_note . }
}
"""

# Run the SPARQL query (the query text lives in `query`).
records = g.query(query)

# Build a polars DataFrame: one upper-cased column per SELECT variable.
columns = [str(var).upper() for var in records.vars]

rows = []
for rec in records:
    if not isinstance(rec, tuple):
        raise TypeError("Records must contains iterable ")
    # Unbound OPTIONAL variables come back as None; keep them as real nulls
    # instead of the string "None" so that drop_nulls() below is effective.
    rows.append(
        {
            name: None if value is None else str(value)
            for name, value in zip(columns, rec)
        }
    )

# An explicit schema keeps every column present (and typed as string) even
# when the query returns no row or a column contains only nulls.
df = polars.DataFrame(rows, schema={name: polars.Utf8 for name in columns})

# Collapse to one row per concept.
# NOTE(review): because of the join described above, PATH and SYNONYMES may
# contain repeated values when a concept has several ancestors AND several
# synonyms/notes -- confirm whether de-duplication is wanted.
df = df.group_by("CONCEPT").agg(
    polars.col("CODE").first(),
    # Same code with the first dot removed (raw string: "\." is a regex).
    polars.col("CODE").first().str.replace(r"\.", "").alias("CODE_2"),
    polars.col("LABEL").first(),
    polars.col("PATH").reverse(),
    polars.col("SYNONYMES").drop_nulls(),
    polars.col("TYPE").first(),
    polars.col("INCLUSION_NOTE").first(),
    polars.col("EXCLUSION_NOTE").first(),
)

df.write_parquet("cim10.parquet")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment