Skip to content

Instantly share code, notes, and snippets.

@jze
Last active July 14, 2023 10:23
Show Gist options
  • Save jze/384ae819aae241753d74632569818924 to your computer and use it in GitHub Desktop.
Save jze/384ae819aae241753d74632569818924 to your computer and use it in GitHub Desktop.
Open Data Metadata Quality
#!/usr/bin/env python
# coding: utf-8
# MIT License
#
# Copyright (c) 2021 Dr. Jesper Zedlitz <[email protected]>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import urllib.parse
import requests
import pandas as pd
from io import StringIO
SPARQL_ENDPOINT = "https://www.govdata.de/sparql"


def query_for_contributors( query, timeout=300):
    """Run a SPARQL query and return its result aggregated per contributor.

    The query is sent to SPARQL_ENDPOINT as a GET request asking for CSV.
    The result must contain a ``contributor`` column; the prefix
    ``http://dcat-ap.de/def/contributors/`` is stripped from its values and
    all remaining columns are summed per contributor.

    Args:
        query: SPARQL query text.
        timeout: Seconds to wait for the endpoint before giving up.

    Returns:
        A DataFrame with one row per contributor and ``contributor`` as a
        regular column.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer within ``timeout``.
    """
    url = SPARQL_ENDPOINT + "?query=" + urllib.parse.quote(query)
    r = requests.get(url, headers={'Accept': 'text/csv'}, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page as CSV.
    r.raise_for_status()
    df = pd.read_csv(StringIO(r.text))
    # Keep only the short contributor name; literal (non-regex) replacement.
    df['contributor'] = df['contributor'].str.replace(
        'http://dcat-ap.de/def/contributors/', '', regex=False)
    df = df.groupby('contributor').sum()
    df.reset_index(inplace=True)
    return df
# Count datasets and distributions
# These two totals are the denominators for the per-contributor ratios
# computed further down in the script.
# First query: number of distributions per contributor, restricted to
# contributor IDs that start with http://dcat-ap.de/def/contributors/.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?distribution) AS ?total) WHERE {
?dataset a dcat:Dataset .
?dataset dcat:distribution ?distribution .
?distribution a dcat:Distribution .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
} GROUP BY ?contributor"""
all_distributions = query_for_contributors(query)
# Second query: number of datasets per contributor (same contributor filter).
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?dataset) AS ?total) WHERE {
?dataset a dcat:Dataset .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
} GROUP BY ?contributor"""
all_datasets = query_for_contributors(query)
# Accessibility - download URL
# Counts, per contributor, the distributions that have NO dcat:downloadURL.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?distribution) AS ?accessibility_download) WHERE {
?dataset a dcat:Dataset .
?dataset dcat:distribution ?distribution .
?distribution a dcat:Distribution .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?distribution dcat:downloadURL ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
# Left merge (no `on` given, so pandas joins on the shared column names):
# every contributor already in all_distributions is kept; contributors that
# are absent from df get NaN in the new column.
all_distributions = all_distributions.merge(df,how='left')
# Reusability - access rights information
# Counts, per contributor, the datasets (excluding those typed as
# "collection") that have NO dct:accessRights.
query = """PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?dataset) AS ?reusability_access_rights) WHERE {
?dataset a dcat:Dataset .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?dataset dct:type <http://dcat-ap.de/def/datasetTypes/collection> })
FILTER(NOT EXISTS { ?dataset dct:accessRights ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
# Left merge: contributors missing from df get NaN in the new column.
all_datasets = all_datasets.merge(df,how='left')
# Reusability - license information
# Counts, per contributor, the distributions that have NO dct:license.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?distribution) AS ?reusability_license_information) WHERE {
?dataset a dcat:Dataset .
?dataset dcat:distribution ?distribution .
?distribution a dcat:Distribution .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?distribution dct:license ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_distributions = all_distributions.merge(df,how='left')
# Reusability - access rights taken from the controlled vocabulary
# Counts non-collection datasets whose dct:accessRights value does not start
# with the EU access-right authority URI prefix.
# NOTE(review): for datasets without dct:accessRights, ?rights is unbound;
# in SPARQL str() of an unbound variable is an error and an erroring FILTER
# rejects the row, so such datasets are presumably not counted here despite
# the OPTIONAL - confirm this is intended.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?dataset) AS ?reusability_access_rights_vocabulary) WHERE {
?dataset a dcat:Dataset .
?dataset dcatde:contributorID ?contributor .
OPTIONAL { ?dataset dct:accessRights ?rights } .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?dataset dct:type <http://dcat-ap.de/def/datasetTypes/collection> })
FILTER( !regex(str(?rights), "^http://publications.europa.eu/resource/authority/access-right/" ) )
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_datasets = all_datasets.merge(df,how='left')
# Reusability - contact information
# Counts non-collection datasets that have NO dcat:contactPoint.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?dataset) AS ?reusability_contact) WHERE {
?dataset a dcat:Dataset .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?dataset dct:type <http://dcat-ap.de/def/datasetTypes/collection> })
FILTER(NOT EXISTS { ?dataset dcat:contactPoint ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_datasets = all_datasets.merge(df,how='left')
# Reusability - publisher
# Counts non-collection datasets that have NO dct:publisher.
query = """PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?dataset) AS ?reusability_publisher) WHERE {
?dataset a dcat:Dataset .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?dataset dct:type <http://dcat-ap.de/def/datasetTypes/collection> })
FILTER(NOT EXISTS { ?dataset dct:publisher ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_datasets = all_datasets.merge(df,how='left')
# Contextuality - rights
# Counts, per contributor, the non-collection datasets that have NO dct:rights.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?dataset) AS ?contextuality_rights) WHERE {
?dataset a dcat:Dataset .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?dataset dct:type <http://dcat-ap.de/def/datasetTypes/collection> })
FILTER(NOT EXISTS { ?dataset dct:rights ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
# Left merge: contributors missing from df get NaN in the new column.
all_datasets = all_datasets.merge(df,how='left')
# Contextuality - dataset: modification date
# Counts non-collection datasets that have NO dct:modified.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?dataset) AS ?contextuality_dataset_modified) WHERE {
?dataset a dcat:Dataset .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?dataset dct:type <http://dcat-ap.de/def/datasetTypes/collection> })
FILTER(NOT EXISTS { ?dataset dct:modified ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_datasets = all_datasets.merge(df,how='left')
# Contextuality - dataset: issue date
# Counts non-collection datasets that have NO dct:issued.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?dataset) AS ?contextuality_dataset_issued) WHERE {
?dataset a dcat:Dataset .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?dataset dct:type <http://dcat-ap.de/def/datasetTypes/collection> })
FILTER(NOT EXISTS { ?dataset dct:issued ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_datasets = all_datasets.merge(df,how='left')
# Contextuality - distribution: modification date
# Counts distributions that have NO dct:modified.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?distribution) AS ?contextuality_distribution_modified) WHERE {
?dataset a dcat:Dataset .
?dataset dcat:distribution ?distribution .
?distribution a dcat:Distribution .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?distribution dct:modified ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_distributions = all_distributions.merge(df,how='left')
# Contextuality - distribution: issue date
# Counts distributions that have NO dct:issued.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?distribution) AS ?contextuality_distribution_issued) WHERE {
?dataset a dcat:Dataset .
?dataset dcat:distribution ?distribution .
?distribution a dcat:Distribution .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?distribution dct:issued ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_distributions = all_distributions.merge(df,how='left')
# Contextuality - file size
# Counts distributions that have NO dcat:byteSize.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?distribution) AS ?contextuality_file_size) WHERE {
?dataset a dcat:Dataset .
?dataset dcat:distribution ?distribution .
?distribution a dcat:Distribution .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?distribution dcat:byteSize ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_distributions = all_distributions.merge(df,how='left')
# Findability - keywords
# Counts, per contributor, the non-collection datasets that have NO dcat:keyword.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?dataset) AS ?findability_keyword) WHERE {
?dataset a dcat:Dataset .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?dataset dct:type <http://dcat-ap.de/def/datasetTypes/collection> })
FILTER(NOT EXISTS { ?dataset dcat:keyword ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
# Left merge: contributors missing from df get NaN in the new column.
all_datasets = all_datasets.merge(df,how='left')
# Findability - categories
# Counts non-collection datasets that have NO dcat:theme.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?dataset) AS ?findability_category) WHERE {
?dataset a dcat:Dataset .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?dataset dct:type <http://dcat-ap.de/def/datasetTypes/collection> })
FILTER(NOT EXISTS { ?dataset dcat:theme ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_datasets = all_datasets.merge(df,how='left')
# Findability - location-based search
# Counts non-collection datasets that have NO dct:spatial.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?dataset) AS ?findability_geo) WHERE {
?dataset a dcat:Dataset .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?dataset dct:type <http://dcat-ap.de/def/datasetTypes/collection> })
FILTER(NOT EXISTS { ?dataset dct:spatial ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_datasets = all_datasets.merge(df,how='left')
# Findability - time-based search
# Counts non-collection datasets that have NO dct:temporal.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?dataset) AS ?findability_time) WHERE {
?dataset a dcat:Dataset .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?dataset dct:type <http://dcat-ap.de/def/datasetTypes/collection> })
FILTER(NOT EXISTS { ?dataset dct:temporal ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_datasets = all_datasets.merge(df,how='left')
# Interoperability - format
# Counts, per contributor, the distributions that have NO dct:format.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?distribution) AS ?interoperability_format) WHERE {
?dataset a dcat:Dataset .
?dataset dcat:distribution ?distribution .
?distribution a dcat:Distribution .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?distribution dct:format ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
# Left merge: contributors missing from df get NaN in the new column.
all_distributions = all_distributions.merge(df,how='left')
# Interoperability - media type
# Counts distributions that have NO dcat:mediaType.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?distribution) AS ?interoperability_media_type) WHERE {
?dataset a dcat:Dataset .
?dataset dcat:distribution ?distribution .
?distribution a dcat:Distribution .
?dataset dcatde:contributorID ?contributor .
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER(NOT EXISTS { ?distribution dcat:mediaType ?x. })
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_distributions = all_distributions.merge(df,how='left')
# Interoperability - format taken from the controlled vocabulary
# Counts distributions whose dct:format value does not start with the EU
# file-type authority URI prefix.
# NOTE(review): for distributions without dct:format, ?format is unbound;
# in SPARQL str() of an unbound variable is an error and an erroring FILTER
# rejects the row, so such distributions are presumably not counted here
# despite the OPTIONAL - confirm this is intended.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?distribution) AS ?interoperability_format_from_vocabulary) WHERE {
?dataset a dcat:Dataset .
?dataset dcat:distribution ?distribution .
?distribution a dcat:Distribution .
?dataset dcatde:contributorID ?contributor .
OPTIONAL { ?distribution dct:format ?format }
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER( !regex(str(?format), "^http://publications.europa.eu/resource/authority/file-type/" ))
} GROUP BY ?contributor"""
df = query_for_contributors(query)
all_distributions = all_distributions.merge(df,how='left')
# Interoperability - media type taken from the controlled vocabulary
# Counts, per contributor, the distributions whose dcat:mediaType value does
# not start with the IANA media-types URI prefix.
# The media type is carried by dcat:mediaType, so that property is tested
# here; dct:format holds the file-type value, which never matches the IANA
# prefix and would mark almost every distribution as non-conforming.
query = """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcatde: <http://dcat-ap.de/def/dcatde/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?contributor (COUNT(?distribution) AS ?interoperability_media_type_from_vocabulary) WHERE {
?dataset a dcat:Dataset .
?dataset dcat:distribution ?distribution .
?distribution a dcat:Distribution .
?dataset dcatde:contributorID ?contributor .
OPTIONAL { ?distribution dcat:mediaType ?mediaType }
FILTER( regex(str(?contributor), "^http://dcat-ap.de/def/contributors/" ) )
FILTER( !regex(str(?mediaType), "^https://www.iana.org/assignments/media-types/" ))
} GROUP BY ?contributor"""
df = query_for_contributors(query)
# Left merge: contributors missing from df get NaN in the new column.
all_distributions = all_distributions.merge(df,how='left')
# The left merges above leave NaN where a contributor did not appear in a
# query result; replace those with 0.
all_distributions = all_distributions.fillna(0)
all_datasets = all_datasets.fillna(0)
# Optional debug output of the absolute counts (disabled):
#all_distributions.to_csv('distributions.csv')
#all_datasets.to_csv('datasets.csv')
# Build `relative`: one row per contributor, one column per metric, each value
# being 1 - (items lacking the property / total items).
#
# Both count tables are indexed by contributor before dividing so that every
# ratio is assigned to the contributor it belongs to. Assigning by the default
# positional index would shift values to the wrong rows whenever the two
# tables do not list exactly the same contributors in the same order.
datasets_by_contributor = all_datasets.set_index('contributor')
distributions_by_contributor = all_distributions.set_index('contributor')
relative = pd.DataFrame(index=datasets_by_contributor.index)
# Distribution-level metrics; contributors without a row in the distribution
# table end up as NaN.
for c in distributions_by_contributor.columns.drop('total'):
    relative[c] = 1 - distributions_by_contributor[c].div(distributions_by_contributor['total'])
# Kept from the original script; not used by the code visible here.
relative_datasets = all_datasets[all_datasets.columns.drop(['total'])]
# Dataset-level metrics.
for c in datasets_by_contributor.columns.drop('total'):
    relative[c] = 1 - datasets_by_contributor[c].div(datasets_by_contributor['total'])
# Optional debug output of the ratios (disabled):
# relative.to_csv('relative.csv')
# Shorten the labels to their last 26 characters for the chart axis.
relative.index = relative.index.str[-26:]
def save_diagram( column, title):
    """Save a horizontal bar chart of one metric as ``<column>.pdf``.

    Reads the module-level DataFrame ``relative`` and draws one bar per row.
    Bars are red for values below 0.5, yellow for values below 1 and green
    otherwise.

    Args:
        column: Name of the column in ``relative`` to plot; also used as the
            base name of the PDF file.
        title: Chart title.
    """
    # Walk the values in row order rather than looking each row up by its
    # index label: the labels are shortened elsewhere in the script, and a
    # label lookup would return several rows at once if two labels collide.
    col = []
    for value in relative[column]:
        if value < 0.5:
            col.append('red')
        elif value < 1:
            col.append('yellow')
        else:
            col.append('green')
    plot = relative[column].plot.barh(figsize=(15,12),title=title, color=col)
    plot.set_xlim(0,1)
    fig = plot.get_figure()
    # Leave room on the left for the contributor labels.
    fig.subplots_adjust(left=0.2, bottom=0.1, right=0.95, top=0.95)
    fig.savefig(column+'.pdf')
    fig.clf()
# Render one PDF chart per metric: (column name, chart title), in the order
# the charts are to be written.
for _column, _title in (
    ('findability_time', 'Auffindbarkeit - Zeitbasierte Suche'),
    ('findability_geo', 'Auffindbarkeit - Ortsbezogene Suche'),
    ('accessibility_download', 'Zugänglichkeit - Download URL'),
    ('reusability_access_rights', 'Wiederverwendbarkeit - Zugangsbeschränkungsangaben'),
    ('reusability_license_information', 'Wiederverwendbarkeit - Lizenzangaben'),
    ('reusability_access_rights_vocabulary', 'Wiederverwendbarkeit - Zugangsbeschränkungsangaben aus Vokabular'),
    ('reusability_contact', 'Wiederverwendbarkeit - Kontaktinformation'),
    ('reusability_publisher', 'Wiederverwendbarkeit - Herausgeber'),
    ('contextuality_rights', 'Kontext - Rechte'),
    ('contextuality_dataset_modified', 'Kontext - Dataset: Änderungsdatum'),
    ('contextuality_dataset_issued', 'Kontext - Dataset: Ausstellungsdatum'),
    ('contextuality_distribution_modified', 'Kontext - Distribution: Änderungsdatum'),
    ('contextuality_distribution_issued', 'Kontext - Distribution: Ausstellungsdatum'),
    ('contextuality_file_size', 'Kontext - Dateigröße'),
    ('findability_keyword', 'Auffindbarkeit - Schlüsselwörter'),
    ('findability_category', 'Auffindbarkeit - Kategorien'),
    ('interoperability_format', 'Interoperabilität - Format'),
    ('interoperability_media_type', 'Interoperabilität - Media Type'),
    ('interoperability_format_from_vocabulary', 'Interoperabilität - Format aus Vokabular'),
    ('interoperability_media_type_from_vocabulary', 'Interoperabilität - Media Type aus Vokabular'),
):
    save_diagram(_column, _title)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment