Skip to content

Instantly share code, notes, and snippets.

@dmj
Created November 2, 2012 09:01
Show Gist options
  • Save dmj/3999608 to your computer and use it in GitHub Desktop.
Save dmj/3999608 to your computer and use it in GitHub Desktop.
Split the German National Library's authority file in single entities
;;;
;; Split the German National Library's authority file into single entities
;;
;; The German National Library (DNB) published its authority file
;; under a CC0 license as one huge Turtle file. In order to work with
;; the data, e.g. as training material for a machine-learning-based NER
;; of person names, we need to split the file into separate entities
;; (subjects).
;;
;; Number of single records: 9,493,986
;; Evaluation took:
;; 7933.054 seconds of real time
;; 4690.561142 seconds of total run time (2912.670031 user, 1777.891111 system)
;; [ Run times consist of 97.449 seconds GC time, and 4593.113 seconds non-GC time. ]
;; 59.13% CPU
;; 12 lambdas converted
;; 17,864,846,985,033 processor cycles
;; 16 page faults
;; 359,893,585,408 bytes consed
;;
;; 9,943,986 single records in 7933 seconds equals 1253 records per
;; second. With a script written in a full featured high level
;; language. Awesome.
;;
;; Author: David Maus <[email protected]>
;;
;; Developed in the course of Lower Saxony's Digital Humanities
;; Research Collaboration <http://www.gcdh.de/en/projects/dh/>.
;;
;;;
(require 'cl-ppcre)
(use-package 'cl-ppcre)
;; Precompiled CL-PPCRE scanners and file locations used by the splitter.
;;
;; These are DEFPARAMETERs rather than DEFCONSTANTs on purpose: every
;; evaluation of CREATE-SCANNER or of a string literal yields a fresh
;; object that is not EQL to the previous one, and redefining a constant
;; with a non-EQL value has undefined consequences (SBCL signals an error
;; when the file is reloaded or compiled and then loaded).

(defparameter turtle-prefix-scanner (create-scanner "^@prefix[ ]+([^ ]+)")
  "Matches a line starting with @prefix; register 0 is the token after it.")

(defparameter turtle-subject-scanner (create-scanner "^<http://d-nb\\.info/([^>]+)>")
  "Matches a line starting with a <http://d-nb.info/...> IRI; register 0 is
the part of the IRI after the host, up to the closing bracket.")

(defparameter turtle-type-scanner (create-scanner "^[ ]+a[ ]+([^ ]+)")
  "Matches a space-indented `a TYPE' line; register 0 is the TYPE token.")

(defparameter dnb-infile "gnd-2012-07-26.ttl"
  "File name of the Turtle dump, relative to DNB-WORKDIR.")

(defparameter dnb-workdir "/media/sdb1/d-nb.info"
  "Directory that holds the dump and receives the entities/ output tree.")
(defmacro dnb-match (scanner line)
  "Run SCANNER against LINE with SCAN-TO-STRINGS.

On a match, store the vector of register strings in the variable
MATCH-DATA of the calling scope (this capture is intentional) and
return T; otherwise return NIL and leave MATCH-DATA untouched."
  `(multiple-value-bind (whole registers) (scan-to-strings ,scanner ,line)
     (if whole
         (progn
           (setf match-data registers)
           t)
         nil)))
(defmacro match-data-token (match-data)
  "Expand into an access to element 0 of the array MATCH-DATA."
  (list 'aref match-data 0))
(defmacro dnb-end-of-entity-p (line)
  "Return true if LINE is a non-empty string whose last character is a dot.

LINE is evaluated exactly once. The expansion refers only to the value
of the argument form, so it no longer depends on the caller having a
variable that happens to be named LINE."
  (let ((text (gensym "LINE")))
    `(let ((,text ,line))
       ;; STRING> against "" is the original non-empty guard; it keeps
       ;; CHAR from being called with index -1 on an empty line.
       (and (string> ,text "")
            (char= #\. (char ,text (1- (length ,text))))))))
(defun dnb-write-entity (prefix id type content)
  "Write one entity to DNB-WORKDIR/entities/TYPE/ID.ttl.

PREFIX is an alist whose CDRs are printed first, one per line, followed
by every element of the list CONTENT, one per line. Missing directories
are created and an existing file is superseded."
  (let* ((folder (concatenate 'string dnb-workdir "/entities/" type "/"))
         (target (concatenate 'string folder id ".ttl")))
    (ensure-directories-exist target)
    (with-open-file (out target :direction :output :if-exists :supersede)
      (dolist (entry prefix)
        (format out "~A~%" (cdr entry)))
      (dolist (text content)
        (format out "~A~%" text)))))
(defun dnb-split (file)
  "Split the Turtle file FILE into one file per entity.

Reads FILE line by line. Lines matching TURTLE-PREFIX-SCANNER are
collected once per prefix token and handed to every DNB-WRITE-ENTITY
call. A line matching TURTLE-SUBJECT-SCANNER starts (or renames) the
current entity; a line matching TURTLE-TYPE-SCANNER inside an entity
sets its type. A line ending in a dot closes the current entity, which
is then written with DNB-WRITE-ENTITY.

Returns the number of entities written.

Changes relative to the previous version:
  - the HANDLER-CASE without any clauses was a no-op and is gone;
  - a statement that ends while no d-nb.info subject has been seen is
    skipped instead of being written to \".../entities//.ttl\";
  - a subject or type line that itself ends in a dot now closes the
    entity instead of leaking into the following one."
  (with-open-file (stream file :direction :input)
    (let ((counter 0)
          ;; MATCH-DATA is assigned by the DNB-MATCH macro.
          match-data prefix entity-id entity-content entity-type)
      ;; Keep the loop variable named LINE: the macros used below are
      ;; expanded in this scope.
      (do ((line (read-line stream nil) (read-line stream nil)))
          ((not line))
        (cond
          ((dnb-match turtle-prefix-scanner line)
           (let ((token (match-data-token match-data)))
             ;; Remember the first declaration of each prefix only.
             (unless (assoc token prefix :test #'string=)
               (push (cons token line) prefix))))
          (t
           (cond
             ((dnb-match turtle-subject-scanner line)
              (setf entity-id (match-data-token match-data)))
             ((and entity-id (dnb-match turtle-type-scanner line))
              (setf entity-type (match-data-token match-data))))
           ;; Only lines that belong to a recognised subject are kept.
           (when entity-id
             (push line entity-content))
           (when (dnb-end-of-entity-p line)
             (when entity-id
               (dnb-write-entity prefix entity-id entity-type
                                 (nreverse entity-content))
               (incf counter))
             (setf entity-id nil
                   entity-type nil
                   entity-content nil)))))
      counter)))
;; Entry point: split the configured dump and print the number returned
;; by DNB-SPLIT right-aligned in a 20-column field, timing the whole run.
(time
 (let ((source (concatenate 'string dnb-workdir "/" dnb-infile)))
   (format t "~20D" (dnb-split source))))
<?php
/**
 * Split the GND Turtle dump into one file per entity.
 *
 * Every line starting with "@prefix" is remembered and written at the top
 * of each output file. All other lines are buffered; a line starting with
 * <http://d-nb.info/gnd/ID> sets the current entity id, and a line ending
 * in "." flushes the buffer to gnd/ID.ttl.
 *
 * Changes relative to the previous version:
 *  - the read loop stops when fgets() returns false instead of pushing
 *    that false into the buffer after the last line;
 *  - a failing fopen() and a missing output directory are handled;
 *  - the subject pattern is tested before the terminator, so a subject
 *    line that also ends the statement no longer aborts the run;
 *  - the unreachable "empty bucket" check is removed;
 *  - the dot in "d-nb.info" is escaped in the subject pattern;
 *  - prefixes and body are written with a single file_put_contents().
 */
$infile = 'gnd-2012-07-26.ttl';
$outdir = 'gnd';

$prefix = array();
$gnd = null;
$matches = array();
$bucket = array();

$fh = fopen($infile, 'r');
if ($fh === false) {
    die("Cannot open {$infile}");
}
if (!is_dir($outdir) && !mkdir($outdir, 0777, true)) {
    die("Cannot create output directory {$outdir}");
}

while (($line = fgets($fh)) !== false) {
    if (preg_match('/^@prefix/', $line)) {
        $prefix[] = $line;
        continue;
    }

    if (preg_match('@^<http://d-nb\.info/gnd/([^>]+)>@', $line, $matches)) {
        $gnd = $matches[1];
        $matches = array();
    }

    $bucket[] = $line;

    // "$" also matches right before the trailing newline kept by fgets().
    if (!preg_match('@\.$@', $line)) {
        continue;
    }

    if ($gnd === null) {
        die("End of entity, but no subject");
    }

    $target = "{$outdir}/{$gnd}.ttl";
    if (file_put_contents($target, array_merge($prefix, $bucket)) === false) {
        die("Cannot write {$target}");
    }
    echo "Wrote {$target}\n";

    $gnd = null;
    $bucket = array();
}

fclose($fh);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment