ubermichael · March 11, 2021 17:51
diff --git a/extract.php b/extract.php
   // Walk every text node and every tei:pb element below /tei:TEI/tei:text,
   // in the order the XPath query returns them. Text is buffered in $text;
   // each element node hit (the query only selects tei:pb elements) flushes
   // the buffer to "<dir>/<id>_<4-digit page counter>" and clears it.
   // NOTE(review): whatever is buffered after the last tei:pb is not written
   // inside this loop - confirm that the surrounding code handles it.
   $nodes = $xp->query('/tei:TEI/tei:text//node()[self::text() or self::tei:pb]');
    for ($n = 0; $n < $nodes->length; ++$n) {
        $node = $nodes->item($n);

        // Text nodes only ever feed the buffer; a trailing space keeps
        // adjacent nodes from running together.
        if ($node instanceof DOMText) {
            $text .= $node->textContent . " ";
            continue;
        }

        // Anything that is not an element node is ignored.
        if (!($node instanceof DOMNode) || XML_ELEMENT_NODE !== $node->nodeType) {
            continue;
        }

        // Element node reached: emit the buffered text as the next page file.
        ++$pageCount;
        $fn = sprintf("%s/%s_%04d", $dir, $id, $pageCount);
        // Runs of two or more whitespace characters collapse to one space.
        $content = preg_replace("/[[:space:]]{2,}/u", ' ', $text) . "\n";
        file_put_contents($fn, $content);
        $text = '';
    }
	// Collect the text nodes and tei:pb elements found under /tei:TEI/tei:text
	// and process them in query order: text nodes are appended to $text, and
	// every element node (only tei:pb is selected by the query) writes the
	// accumulated text to "<dir>/<id>_<4-digit page counter>" before the
	// buffer is reset.
	// NOTE(review): text gathered after the final tei:pb stays in $text when
	// the loop ends - verify that the code following this span writes it out.
	$nodes = $xp->query('/tei:TEI/tei:text//node()[self::text() or self::tei:pb]');
	for ($n = 0; $n < $nodes->length; ++$n) {
		$node = $nodes->item($n);
		if ($node instanceof DOMNode && XML_ELEMENT_NODE === $node->nodeType) {
			// Element node: flush the buffer as the next numbered page file,
			// squeezing runs of 2+ whitespace characters into a single space.
			++$pageCount;
			$fn = sprintf("%s/%s_%04d", $dir, $id, $pageCount);
			$content = preg_replace("/[[:space:]]{2,}/u", ' ', $text) . "\n";
			file_put_contents($fn, $content);
			$text = '';
		} elseif ($node instanceof DOMText) {
			// Text node: buffer its content, separated by a single space.
			$text .= $node->textContent . " ";
		}
	}
No results found