Created
November 2, 2012 09:01
-
-
Save dmj/3999608 to your computer and use it in GitHub Desktop.
Split the German National Library's authority file in single entities
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;; | |
;; Split the German National Library's authority file in single entities | |
;; | |
;; The German National Library (DNB) published its authority file
;; under a CC0 license as one huge Turtle file. In order to work with
;; the data, e.g. as training material for a machine-learning-based
;; NER of person names, we need to split the file into separate
;; entities (subjects).
;; | |
;; Number of single records: 9,493,986 | |
;; Evaluation took: | |
;; 7933.054 seconds of real time | |
;; 4690.561142 seconds of total run time (2912.670031 user, 1777.891111 system) | |
;; [ Run times consist of 97.449 seconds GC time, and 4593.113 seconds non-GC time. ] | |
;; 59.13% CPU | |
;; 12 lambdas converted | |
;; 17,864,846,985,033 processor cycles | |
;; 16 page faults | |
;; 359,893,585,408 bytes consed | |
;; | |
;; 9,943,986 single records in 7933 seconds equals 1253 records per | |
;; second. With a script written in a full featured high level | |
;; language. Awesome. | |
;; | |
;; Author: David Maus <[email protected]> | |
;; | |
;; Developed in the course of Lower Saxony's Digital Humanities
;; Research Collaboration <http://www.gcdh.de/en/projects/dh/>.
;; | |
;;; | |
(require 'cl-ppcre) | |
(use-package 'cl-ppcre) | |
;; Configuration and precompiled scanners.
;;
;; These are deliberately DEFPARAMETER rather than DEFCONSTANT: the
;; values are freshly created scanner objects and strings, which are
;; not EQL from one evaluation to the next.  Re-evaluating a
;; DEFCONSTANT with a non-EQL value (reloading the file, or compiling
;; it and then loading the result) has undefined consequences and is
;; signalled as an error by some implementations.

;; Matches a "@prefix" directive; register 0 is the prefix label.
(defparameter turtle-prefix-scanner (create-scanner "^@prefix[ ]+([^ ]+)"))
;; Matches a line starting with a d-nb.info subject IRI; register 0 is
;; the part of the IRI after "http://d-nb.info/".
(defparameter turtle-subject-scanner (create-scanner "^<http://d-nb\\.info/([^>]+)>"))
;; Matches an indented "a <type>" predicate; register 0 is the type token.
(defparameter turtle-type-scanner (create-scanner "^[ ]+a[ ]+([^ ]+)"))
;; Name of the Turtle dump, relative to DNB-WORKDIR.
(defparameter dnb-infile "gnd-2012-07-26.ttl")
;; Directory holding the dump; entity files are written below it.
(defparameter dnb-workdir "/media/sdb1/d-nb.info")
(defmacro dnb-match (scanner line)
  "Match LINE against SCANNER with SCAN-TO-STRINGS.

On a match, the second value returned by SCAN-TO-STRINGS is stored in
the variable MATCH-DATA of the calling scope (this macro is
intentionally anaphoric: the caller must provide that variable) and
the expansion evaluates to T.  Without a match MATCH-DATA is left
untouched and the expansion evaluates to NIL."
  (let ((whole (gensym "WHOLE"))
        (groups (gensym "GROUPS")))
    `(multiple-value-bind (,whole ,groups) (scan-to-strings ,scanner ,line)
       (if ,whole
           (progn (setf match-data ,groups) t)
           nil))))
(defmacro match-data-token (match-data)
  "Expand into an AREF form for element 0 of the array MATCH-DATA.

Because the expansion is a plain AREF form it can also be used as a
SETF place."
  (list 'aref match-data 0))
(defmacro dnb-end-of-entity-p (line)
  "Expand into a test that is true when LINE ends with a full stop.

LINE is a form yielding a string; it is evaluated exactly once.  An
empty string yields NIL.  The check looks only at the very last
character, so trailing whitespace after the full stop makes it fail.

The previous expansion referred to a free variable named LINE in one
place instead of to the macro argument, so it only worked when the
caller's variable happened to be called LINE."
  (let ((text (gensym "TEXT")))
    `(let ((,text ,line))
       (and (plusp (length ,text))
            (char= #\. (char ,text (1- (length ,text))))))))
(defun dnb-write-entity (prefix id type content)
  "Write one entity to DNB-WORKDIR/entities/TYPE/ID.ttl.

PREFIX is an alist whose CDRs are the prefix lines to emit first;
CONTENT is the list of the entity's own lines.  Every item is written
followed by a newline.  Missing directories are created and an
existing file is superseded.  Returns NIL."
  (let ((target (concatenate 'string dnb-workdir "/entities/" type "/" id ".ttl")))
    (ensure-directories-exist target)
    (with-open-file (out target :direction :output :if-exists :supersede)
      (flet ((emit (text)
               (princ text out)
               (terpri out)))
        ;; Prefix declarations go first so the file stands on its own.
        (dolist (entry prefix)
          (emit (cdr entry)))
        (dolist (text content)
          (emit text))))))
(defun dnb-split (file)
  "Split the Turtle dump FILE into one file per entity.

FILE is read line by line:

  - lines matching TURTLE-PREFIX-SCANNER are remembered (once per
    prefix label) and prepended to every entity file;
  - a line matching TURTLE-SUBJECT-SCANNER supplies the entity id,
    a line matching TURTLE-TYPE-SCANNER the entity type; both start
    or extend the collected entity content;
  - any other line is collected only while an entity is in progress;
  - a line ending with a full stop terminates the entity, which is
    then handed to DNB-WRITE-ENTITY.

Returns the number of entities written.

Differences from the earlier version: the statement terminator is
now also recognised on the subject or type line itself (previously
such an entity was silently merged into the following one), a
terminated statement without a known subject id is discarded instead
of being written to a file named \".ttl\", and a HANDLER-CASE without
any handler clauses (a no-op) was removed."
  (with-open-file (stream file :direction :input)
    (let ((counter 0)
          ;; MATCH-DATA is assigned by the DNB-MATCH macro.
          match-data prefix entity-id entity-content entity-type)
      (flet ((finish-entity ()
               ;; Without a subject id there is no sensible file name,
               ;; so such content is dropped rather than written.
               (when entity-id
                 (dnb-write-entity prefix entity-id entity-type
                                   (nreverse entity-content))
                 (incf counter))
               (setf entity-id nil
                     entity-type nil
                     entity-content nil)))
        (do ((line (read-line stream nil) (read-line stream nil)))
            ((not line))
          (cond
            ((dnb-match turtle-prefix-scanner line)
             (let ((token (match-data-token match-data)))
               (unless (assoc token prefix :test #'string=)
                 (push (cons token line) prefix))))
            (t
             (cond
               ((dnb-match turtle-subject-scanner line)
                (setf entity-id (match-data-token match-data))
                (push line entity-content))
               ((dnb-match turtle-type-scanner line)
                (setf entity-type (match-data-token match-data))
                (push line entity-content))
               (entity-content
                (push line entity-content)))
             ;; Checked independently of the classification above so a
             ;; statement that starts and ends on one line is closed too.
             (when (dnb-end-of-entity-p line)
               (finish-entity))))))
      counter)))
;; Run the split on the configured dump, print the number of entities
;; written (right-aligned in 20 columns) and report timing via TIME.
(time
 (let ((infile (concatenate 'string dnb-workdir "/" dnb-infile)))
   (format t "~20D" (dnb-split infile))))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Split the GND Turtle dump into one file per entity.
 *
 * Reads gnd-2012-07-26.ttl line by line.  "@prefix" lines are collected
 * and written at the top of every entity file.  A line starting with a
 * <http://d-nb.info/gnd/...> subject sets the current entity id; every
 * non-prefix line is appended to the current bucket; a line ending in
 * "." closes the entity, which is written to gnd/<id>.ttl.
 *
 * Changes compared to the earlier version:
 *  - fopen() failure is reported instead of looping on an invalid handle;
 *  - the loop stops when fgets() returns false instead of processing
 *    that value as if it were a line;
 *  - the subject is detected before the terminator, so a statement that
 *    starts and ends on the same line no longer dies with "no subject";
 *  - the unreachable "empty bucket" check (the bucket always contained
 *    the line that had just been appended) was removed;
 *  - the output directory is created if missing and write failures abort
 *    instead of being reported as "Wrote ...".
 */
$prefix  = array();
$gnd     = null;
$matches = array();
$bucket  = array();

$fh = fopen('gnd-2012-07-26.ttl', 'r');
if ($fh === false) {
    die("Cannot open gnd-2012-07-26.ttl");
}
if (!is_dir('gnd') && !mkdir('gnd', 0777, true)) {
    die("Cannot create output directory gnd");
}

while (($line = fgets($fh)) !== false) {
    if (preg_match('/^@prefix/', $line)) {
        $prefix []= $line;
        continue;
    }

    if (preg_match('@^<http://d-nb\.info/gnd/([^>]+)>@', $line, $matches)) {
        $gnd = $matches[1];
        $matches = array();
    }
    $bucket []= $line;

    // "$" also matches before a trailing newline, so this detects the
    // statement terminator on lines as returned by fgets().
    if (preg_match('@\.$@', $line)) {
        if ($gnd === null) {
            die("End of entity, but no subject");
        }
        // Prefix declarations first, then the entity's own lines.
        if (file_put_contents("gnd/{$gnd}.ttl", array_merge($prefix, $bucket)) === false) {
            die("Cannot write gnd/{$gnd}.ttl");
        }
        echo "Wrote gnd/{$gnd}.ttl\n";
        $gnd = null;
        $bucket = array();
    }
}
fclose($fh);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment