Last active
May 21, 2020 14:28
-
-
Save marcus-at-localhost/2881630444fbd69039c8 to your computer and use it in GitHub Desktop.
[Relative to absolute links with regex and DOMDocument] #domdocument #regex
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
/**
 * Turns relative `href`/`src` attribute values of an HTML fragment into
 * absolute URLs by prefixing them with $base, using a single regex pass.
 *
 * Only double-quoted attribute values are matched. Values are left untouched
 * when they
 *  - contain a colon (anything with a scheme: `http:`, `mailto:`, `data:` ...),
 *  - are protocol-relative (`//host/path`),
 *  - are fragment-only (`#anchor`) or empty.
 * Root-relative values (`/path`) are joined onto $base as given, without
 * producing a double slash.
 *
 * @param mixed $text HTML fragment; it is cast to string and trimmed.
 * @param mixed $base Base URL. A trailing "/" is appended when missing. When
 *                    empty, $text is returned exactly as passed in.
 *
 * @return mixed The trimmed fragment with rewritten URLs, or the untouched
 *               $text when $base is empty.
 */
function regexToAbs($text, $base)
{
    if (empty($base)) {
        return $text;
    }

    // base url needs trailing /
    if (substr($base, -1) !== '/') {
        $base .= '/';
    }

    // Cast before trimming so a null $text does not hit trim() directly.
    $text = trim((string) $text);
    if ($text === '') {
        return $text;
    }

    // A callback is used instead of a replacement string so that "$1" or "\1"
    // occurring inside $base is inserted literally and never expanded as a
    // back-reference.
    // http://wintermute.com.au/bits/2005-09/php-relative-absolute-links/
    $result = preg_replace_callback(
        '#(href|src)="([^:"]*)"#',
        static function (array $match) use ($base) {
            $url = $match[2];

            // Empty, fragment-only and protocol-relative values must stay as they are.
            if ($url === '' || $url[0] === '#' || strncmp($url, '//', 2) === 0) {
                return $match[0];
            }

            // $base already ends in "/", so drop the leading slash of root-relative paths.
            return $match[1] . '="' . $base . ltrim($url, '/') . '"';
        },
        $text
    );

    // preg_replace_callback() yields null on a PCRE error; fall back to the input.
    return $result === null ? $text : $result;
}
/**
 * Turns relative `src` attributes of <img> elements in an HTML fragment into
 * absolute URLs by prefixing them with $base, using DOMDocument.
 *
 * A `src` is left untouched when it is empty/missing, fragment-only (`#...`),
 * protocol-relative (`//host/path`) or already carries a scheme (`http:`,
 * `https:`, `data:` ...). Root-relative values (`/path`) are joined onto $base
 * as given, without producing a double slash.
 *
 * The fragment is parsed as UTF-8. Because it goes through an HTML parser and
 * serializer, the returned markup is normalized (e.g. `<img ... />` becomes
 * `<img ...>`), and non-ASCII characters may come back as character references.
 *
 * @param mixed $text HTML fragment (UTF-8); it is cast to string and trimmed.
 * @param mixed $base Base URL. A trailing "/" is appended when missing. When
 *                    empty, $text is returned exactly as passed in.
 *
 * @return mixed The re-serialized fragment with rewritten image URLs, the
 *               trimmed fragment when it cannot be parsed, or the untouched
 *               $text when $base is empty.
 */
function domdocToAbs($text, $base)
{
    if (empty($base)) {
        return $text;
    }

    // base url needs trailing /
    if (substr($base, -1) !== '/') {
        $base .= '/';
    }

    // Cast before trimming so a null $text does not hit trim() directly.
    $text = trim((string) $text);
    if (empty($text)) {
        return $text;
    }

    $DOM = new DOMDocument();

    // loadHTML() assumes ISO-8859-1 unless the markup declares a charset, so a
    // charset hint is prepended instead of squeezing the input through the
    // lossy utf8_decode(). Parser diagnostics are collected internally rather
    // than being emitted as PHP warnings.
    $charsetHint = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';
    $previousSetting = libxml_use_internal_errors(true);
    $loaded = $DOM->loadHTML($charsetHint . $text);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    if (!$loaded) {
        return $text;
    }

    // Take the injected hint out again (it is the first <meta> in document
    // order), together with the <head> the parser created for it if that is
    // now empty.
    $hint = $DOM->getElementsByTagName('meta')->item(0);
    if ($hint !== null && $hint->parentNode !== null) {
        $holder = $hint->parentNode;
        $holder->removeChild($hint);
        if ($holder->nodeName === 'head' && !$holder->hasChildNodes() && $holder->parentNode !== null) {
            $holder->parentNode->removeChild($holder);
        }
    }

    foreach ($DOM->getElementsByTagName('img') as $img) {
        $src = $img->getAttribute('src');

        // Skip anything that is not a relative reference.
        if ($src === ''
            || $src[0] === '#'
            || strncmp($src, '//', 2) === 0
            || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $src)
        ) {
            continue;
        }

        // $base already ends in "/", so drop the leading slash of root-relative paths.
        $img->setAttribute('src', $base . ltrim($src, '/'));
    }

    // remove the redundant doctype/html/body DOMDocument added with Regex
    // http://stackoverflow.com/a/10023094
    return preg_replace('~<(?:!DOCTYPE|/?(?:html|body))[^>]*>\s*~i', '', $DOM->saveHTML());
}
// Demo: run one relative <img> tag through both converters and dump the two
// results next to each other for comparison.
$base = "http://www.de";
$input = <<<HTM
<img src="relative.html" />
HTM;
$viaRegex = regexToAbs($input, $base);
$viaDom = domdocToAbs($input, $base);
var_dump($viaRegex, $viaDom);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment