Created
August 1, 2010 17:31
-
-
Save RdeWilde/503555 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Downloads one passage from biblija.net, parses the returned HTML and prints
 * every non-blank line of each text paragraph as a two-element array via
 * print_r(). The line is split at its first run of horizontal whitespace, so
 * the two elements are presumably the verse number and the verse text
 * (NOTE(review): confirm against the live markup).
 *
 * Throws Exception when the page cannot be downloaded, cannot be parsed as
 * HTML, or when a paragraph cannot be split into lines.
 */

$sUrl = 'http://www.biblija.net/biblija.cgi?m=lc1,1-10&id42=0&l=nl';

// Compare explicitly instead of relying on truthiness; an empty body is
// treated as a failed download, exactly as before.
$sContent = file_get_contents($sUrl);
if ($sContent === false || $sContent === '')
{
    throw new Exception('Tekst kon niet geladen worden');
}

// Normalise the page to UTF-8. Detection is strict and limited to an explicit
// candidate list, because the default detection may return false or guess
// wrongly, and a false value must never reach mb_convert_encoding().
$sEncoding = mb_detect_encoding($sContent, 'UTF-8, ISO-8859-1', true);
if ($sEncoding === false)
{
    // Every byte sequence is valid ISO-8859-1, so this is a safe last resort.
    $sEncoding = 'ISO-8859-1';
}
if ($sEncoding !== 'UTF-8')
{
    $sContent = mb_convert_encoding($sContent, 'UTF-8', $sEncoding);
}

// DOMDocument::loadHTML() assumes ISO-8859-1 unless the document says
// otherwise, so every non-ASCII character is turned into a numeric entity
// first. This replaces the 'html-entities' pseudo-encoding of
// mb_convert_encoding(), which is deprecated as of PHP 8.2.
$sContent = mb_encode_numericentity($sContent, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

// Real-world HTML is rarely well-formed: collect libxml's complaints
// internally instead of letting them leak into the output as PHP warnings.
$oDOM = new DOMDocument('1.0');
$bPreviousErrorMode = libxml_use_internal_errors(true);
$bLoaded = $oDOM->loadHTML($sContent);
libxml_clear_errors();
libxml_use_internal_errors($bPreviousErrorMode);
if (!$bLoaded)
{
    throw new Exception('Kon tekst niet parsen');
}

// Note: the HTML parser lower-cases all tag and attribute names.
$oQry = new DOMXPath($oDOM);

// The table cells that contain the passage text.
$oTexts = $oQry->query('//td[@class = "text"]');
foreach ($oTexts as $oText)
{
    // Walk through every <div> directly below the text cell.
    $oParas = $oQry->query('div', $oText);
    foreach ($oParas as $oPara)
    {
        // Only the paragraph classes 'm' and 'p' are processed; any other
        // paragraph type is deliberately ignored.
        if (!in_array($oPara->getAttribute('class'), array('m', 'p'), true))
        {
            continue;
        }

        // \v = vertical whitespace. The `u` modifier is essential: in byte
        // mode \v matches byte 0x85 and \h matches byte 0xA0, which cuts
        // multi-byte UTF-8 characters in half (a non-breaking space C2 A0
        // leaves a stray 0xC2 behind, shown as "Â").
        $aVerses = preg_split('/\v/u', $oPara->textContent);
        if ($aVerses === false)
        {
            throw new Exception('Could not split paragraph into lines (PCRE error ' . preg_last_error() . ')');
        }

        foreach ($aVerses as $sVerse)
        {
            // Skip blank lines, including lines that consist solely of
            // horizontal whitespace such as non-breaking spaces.
            if (trim($sVerse) === '' || preg_match('/^\h*$/u', $sVerse) === 1)
            {
                continue;
            }

            // \h = horizontal whitespace; the limit of 2 splits only at the
            // first whitespace run and keeps the rest of the line intact.
            $aResult = preg_split('/\h+/u', $sVerse, 2);
            if ($aResult === false)
            {
                throw new Exception('Could not split line (PCRE error ' . preg_last_error() . ')');
            }

            print_r($aResult);
        }
    }
}
?>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment