Created
June 21, 2023 07:44
-
-
Save DrDub/f1c150eac52352751d5f1c208ee91f16 to your computer and use it in GitHub Desktop.
Detagging HTML to do pre-trained transformers fine-tuning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// This code Copyright (C) 2023 Textualization Software Ltd. is dual
// licensed PHP and LGPLv2.1 and it comes with NO WARRANTIES. | |
// v1.0 | |
/**
How to use it to obtain text for fine-tuning transformer models
---------------------------------------------------------------

Assuming a crawl/ folder with a full crawl of websites of interest, with rendered DOM documents with extension .html:

php detag.php > html_php.txt 2> html_php.log

Assuming a Linux machine with the following programs installed:
* pandoc
* pdftotext
* antiword

find crawl -follow -name \*.doc -exec antiword \{} \; > doc_antiword.txt 2> doc_antiword.log
find crawl -follow -name \*.docx -exec pandoc --to plain \{} \; > docx_pandoc.txt 2> docx_pandoc.log
find crawl -follow -name \*.pdf -exec pdftotext -nopgbrk \{} - \; > pdf_pdftotext.txt 2> pdf_pdftotext.log

Then merge all the files together:

cat html_php.txt doc_antiword.txt docx_pandoc.txt pdf_pdftotext.txt > all.txt
*/
/**
 * Recursively collect the paths of every file whose extension is "html".
 *
 * Sub-directories are descended into depth-first; the "." and ".." entries
 * are skipped. Paths come back in whatever order DirectoryIterator yields
 * them, each built from the given $folder plus the entry name.
 *
 * @param string $folder Directory to scan.
 * @return string[] Pathnames of the .html files found under $folder.
 */
function getHtmls(string $folder): array
{
    $result = [];
    foreach (new DirectoryIterator($folder) as $fileInfo) {
        if ($fileInfo->isDot()) {
            continue;
        }
        if ($fileInfo->isDir()) {
            // Append in place: calling array_merge() here would re-copy the
            // whole accumulated list once per sub-directory.
            foreach (getHtmls($fileInfo->getPathname()) as $path) {
                $result[] = $path;
            }
            continue;
        }
        if ($fileInfo->getExtension() === 'html') {
            $result[] = $fileInfo->getPathname();
        }
    }
    return $result;
}
// Every .html document found below the crawl/ folder.
$htmls = getHtmls("crawl/");

// The three lookup tables below are built with array_flip so that each entry
// becomes an array key, giving cheap set-membership tests.

// Inline elements, taken from
// https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#list_of_inline_elements
$HTML5_INLINE = array_flip([
    'a', 'abbr', 'acronym', 'audio', 'b', 'bdi', 'bdo', 'big', 'br',
    'button', 'canvas', 'cite', 'code', 'data', 'del', 'dfn', 'em', 'embed',
    'i', 'iframe', 'img', 'input', 'ins', 'kbd', 'label', 'map', 'mark',
    'meter', 'noscript', 'object', 'output', 'picture', 'progress', 'q',
    'ruby', 's', 'samp', 'script', 'select', 'slot', 'small', 'span',
    'strong', 'sub', 'sup', 'svg', 'template', 'textarea', 'time', 'u', 'tt',
    'var', 'video', 'wbr',
]);

// Elements whose whole subtree is dropped from the extracted text.
$PRUNE_TAGS = array_flip([
    'head',
    'script',
    'style',
]);

// Characters that already end a sentence, so no period is appended after them.
$END_CHARS = array_flip([
    '.',
    ':',
    '?',
    '!',
]);

// Collect libxml parse errors internally instead of emitting PHP warnings.
libxml_use_internal_errors(true);
/**
 * Turn a run of extracted text into a finished line.
 *
 * The text is trimmed; if nothing is left an empty string is returned.
 * Otherwise a period is appended unless the last character is already one of
 * $END_CHARS, and the line is terminated with "\n".
 *
 * @param string $text Raw text gathered from a node or a run of inline nodes.
 * @return string Either "" or the trimmed text ending in a newline.
 */
function closeSentence(string $text): string
{
    global $END_CHARS;

    $trimmed = trim($text);
    if ($trimmed === '') {
        return '';
    }
    // $END_CHARS only holds ASCII characters, so checking the last byte is
    // safe even when the text is UTF-8.
    if (!array_key_exists(substr($trimmed, -1), $END_CHARS)) {
        $trimmed .= '.';
    }
    return $trimmed . "\n";
}

/**
 * Extract the visible text below a DOM node.
 *
 * Adds a period at the end of "visible elements" like a paragraph, a div or
 * a title, and puts each of them on its own line. Text nodes and the elements
 * listed in $HTML5_INLINE are treated as inline: their text is concatenated
 * as-is so that a sentence such as "Hello <b>world</b> again" stays in one
 * piece instead of being split at every tag boundary. Subtrees rooted at a
 * tag from $PRUNE_TAGS contribute nothing.
 *
 * @param DOMNode $node Node (or whole document) to detag.
 * @return string Extracted text; block-level children are already terminated
 *                with a newline, trailing inline text is returned raw so the
 *                caller can decide how to terminate it.
 */
function recurse(DOMNode $node) : string {
    global $PRUNE_TAGS;
    global $HTML5_INLINE;

    if ($node->nodeType === XML_TEXT_NODE) {
        return (string) $node->nodeValue;
    }
    if ($node->nodeName && array_key_exists($node->nodeName, $PRUNE_TAGS)) {
        return "";
    }
    if ($node->nodeName === 'br') {
        // A line break is inline but must still separate the text around it,
        // otherwise "one<br>two" would be glued together as "onetwo".
        return "\n";
    }

    $result = "";
    $inlineRun = ""; // inline text seen since the last block-level child
    // childNodes may be null for node types that cannot have children
    // (observed on older PHP versions), hence the fallback.
    foreach ($node->childNodes ?? [] as $childNode) {
        $rec = recurse($childNode);
        $isInline = $childNode->nodeType === XML_TEXT_NODE
            || array_key_exists($childNode->nodeName, $HTML5_INLINE);
        if ($isInline) {
            $inlineRun .= $rec;
            continue;
        }
        // A block-level child ends whatever inline text preceded it, then
        // contributes its own terminated line(s).
        $result .= closeSentence($inlineRun) . closeSentence($rec);
        $inlineRun = "";
    }
    return $result . $inlineRun;
}
// Detag every crawled document and print the extracted text to stdout;
// problems with individual files are reported on stderr and skipped.
foreach ($htmls as $html) {
    $content = file_get_contents($html);
    if ($content === false) {
        fwrite(STDERR, "Could not read {$html}\n");
        continue;
    }
    if (empty($content)) {
        continue;
    }

    // The libxml HTML parser does not assume UTF-8 input, so every non-ASCII
    // character is turned into a numeric entity first. This replaces
    // mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8'), which is deprecated
    // as of PHP 8.2.
    $content = mb_encode_numericentity($content, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

    $doc = new DOMDocument('1.0', 'utf-8');
    $loaded = $doc->loadHTML($content, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);

    // Parse errors are buffered by libxml (internal error mode); drop them
    // after each file so the buffer does not grow over the whole crawl.
    libxml_clear_errors();

    if ($loaded === false) {
        fwrite(STDERR, "Could not parse {$html}\n");
        continue;
    }

    print(recurse($doc));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment