marcus-at-localhost · May 21, 2020 14:28
diff --git a/relabsolut.php b/relabsolut.php
 <?php

 /**
  * Prefix relative href/src attribute values in an HTML fragment with a base URL.
  *
  * Only double-quoted href="…" / src="…" attributes are rewritten. Values that
  * contain a colon (e.g. "http://…", "mailto:…") never match the pattern and
  * are left untouched, as are protocol-relative values ("//host/path").
  * A leading slash on the value is dropped so the result never contains a
  * double slash after the base.
  *
  * @param string $text HTML fragment to rewrite.
  * @param string $base Base URL; a trailing slash is appended when missing.
  * @return string|null The rewritten fragment; $text unchanged when $base is
  *                     empty; the trimmed text when it is empty; null if the
  *                     regex engine reports an error.
  */
 function regexToAbs($text, $base)
 {
 	if (empty($base)) {
 		return $text;
 	}

 	// base url needs trailing /
 	if (substr($base, -1, 1) != "/") {
 		$base .= "/";
 	}

 	$text = (string) trim($text);

 	if (empty($text)) {
 		return $text;
 	}

 	// Pattern from http://wintermute.com.au/bits/2005-09/php-relative-absolute-links/
 	// A callback is used instead of a replacement string so that "$1" or "\1"
 	// sequences inside $base are inserted literally rather than being expanded
 	// as backreferences.
 	return preg_replace_callback(
 		'#(href|src)="([^:"]*)(?:")#',
 		function ($match) use ($base) {
 			$url = $match[2];

 			// Protocol-relative URLs are already absolute apart from the scheme.
 			if (substr($url, 0, 2) === '//') {
 				return $match[0];
 			}

 			// $base always ends in "/", so strip a leading one from the value.
 			return $match[1] . '="' . $base . ltrim($url, '/') . '"';
 		},
 		$text
 	);
 }
 	
 	
 	
 	/**
 	 * Prefix relative <img src> values in an HTML fragment with a base URL,
 	 * using DOMDocument instead of a regular expression.
 	 *
 	 * A src that is empty/missing, starts with "http", or starts with "/" is
 	 * left as it is; every other src gets $base prepended.
 	 *
 	 * @param string $text HTML fragment to rewrite.
 	 * @param string $base Base URL; a trailing slash is appended when missing.
 	 * @return string|null The re-serialised fragment with the DOCTYPE/html/body
 	 *                     wrapper removed; $text unchanged when $base is empty;
 	 *                     the trimmed text when it is empty; null if the cleanup
 	 *                     regex reports an error.
 	 */
 	function domdocToAbs($text, $base)
 	{
 		if (empty($base)) {
 			return $text;
 		}

 		// base url needs trailing /
 		if (substr($base, -1, 1) != "/") {
 			$base .= "/";
 		}

 		$text = (string) trim($text);

 		if (empty($text)) {
 			return $text;
 		}

 		$DOM = new DOMDocument();
 		$DOM->encoding = 'utf-8';

 		// NOTE(review): utf8_decode() converts UTF-8 to ISO-8859-1, so characters
 		// outside Latin-1 are lost, and the function is deprecated as of PHP 8.2.
 		// Kept here so the serialised output does not change; revisit separately.
 		$DOM->loadHTML(utf8_decode($text));

 		// Loading with LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD was tried to
 		// avoid the DOCTYPE/html/body wrapper but produced faulty HTML, so the
 		// wrapper is stripped from the output below instead.

 		foreach ($DOM->getElementsByTagName('img') as $img) {
 			$src = $img->getAttribute('src');

 			// Skip images without a src and sources that are already absolute
 			// ("http…") or root-relative ("/…"). The two prefix checks must be
 			// combined so that EITHER one exempts the value; requiring a value
 			// to fail only one of them would rewrite every src.
 			if ($src === '' || strpos($src, 'http') === 0 || strpos($src, '/') === 0) {
 				continue;
 			}

 			$img->setAttribute('src', $base . $src);
 		}

 		// remove the redundant doctype/html/body DOMDocument added
 		// http://stackoverflow.com/a/10023094
 		return preg_replace('~<(?:!DOCTYPE|/?(?:html|body))[^>]*>\s*~i', '', $DOM->saveHTML());
 	}
 	
 	// Demo: feed the same one-image fragment to both converters and dump the
 	// two results side by side so their output can be compared.
 	$input = <<<HTM
 	<img src="relative.html" />
 	HTM;
 	
 	var_dump(regexToAbs($input,"http://www.de"), domdocToAbs($input,"http://www.de"));
	<?php

	/**
	 * Prefix relative href/src attribute values in an HTML fragment with a base URL.
	 *
	 * Only double-quoted href="…" / src="…" attributes are rewritten. Values that
	 * contain a colon (e.g. "http://…", "mailto:…") never match the pattern and
	 * are left untouched, as are protocol-relative values ("//host/path").
	 * A leading slash on the value is dropped so the result never contains a
	 * double slash after the base.
	 *
	 * @param string $text HTML fragment to rewrite.
	 * @param string $base Base URL; a trailing slash is appended when missing.
	 * @return string|null The rewritten fragment; $text unchanged when $base is
	 *                     empty; the trimmed text when it is empty; null if the
	 *                     regex engine reports an error.
	 */
	function regexToAbs($text, $base)
	{
		if (empty($base)) {
			return $text;
		}

		// base url needs trailing /
		if (substr($base, -1, 1) != "/") {
			$base .= "/";
		}

		$text = (string) trim($text);

		if (empty($text)) {
			return $text;
		}

		// Pattern from http://wintermute.com.au/bits/2005-09/php-relative-absolute-links/
		// The alternation must be a bare "|"; an escaped "\|" would only match a
		// literal pipe character. A callback is used instead of a replacement
		// string so that "$1" or "\1" sequences inside $base are inserted
		// literally rather than being expanded as backreferences.
		return preg_replace_callback(
			'#(href|src)="([^:"]*)(?:")#',
			function ($match) use ($base) {
				$url = $match[2];

				// Protocol-relative URLs are already absolute apart from the scheme.
				if (substr($url, 0, 2) === '//') {
					return $match[0];
				}

				// $base always ends in "/", so strip a leading one from the value.
				return $match[1] . '="' . $base . ltrim($url, '/') . '"';
			},
			$text
		);
	}



	/**
	 * Prefix relative <img src> values in an HTML fragment with a base URL,
	 * using DOMDocument instead of a regular expression.
	 *
	 * A src that is empty/missing, starts with "http", or starts with "/" is
	 * left as it is; every other src gets $base prepended.
	 *
	 * @param string $text HTML fragment to rewrite.
	 * @param string $base Base URL; a trailing slash is appended when missing.
	 * @return string|null The re-serialised fragment with the DOCTYPE/html/body
	 *                     wrapper removed; $text unchanged when $base is empty;
	 *                     the trimmed text when it is empty; null if the cleanup
	 *                     regex reports an error.
	 */
	function domdocToAbs($text, $base)
	{
		if (empty($base)) {
			return $text;
		}

		// base url needs trailing /
		if (substr($base, -1, 1) != "/") {
			$base .= "/";
		}

		$text = (string) trim($text);

		if (empty($text)) {
			return $text;
		}

		$DOM = new DOMDocument();
		$DOM->encoding = 'utf-8';

		// NOTE(review): utf8_decode() converts UTF-8 to ISO-8859-1, so characters
		// outside Latin-1 are lost, and the function is deprecated as of PHP 8.2.
		// Kept here so the serialised output does not change; revisit separately.
		$DOM->loadHTML(utf8_decode($text));

		// Loading with LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD was tried to
		// avoid the DOCTYPE/html/body wrapper but produced faulty HTML, so the
		// wrapper is stripped from the output below instead.

		foreach ($DOM->getElementsByTagName('img') as $img) {
			$src = $img->getAttribute('src');

			// Skip images without a src and sources that are already absolute
			// ("http…") or root-relative ("/…"). The two prefix checks must be
			// combined so that EITHER one exempts the value; requiring a value
			// to fail only one of them would rewrite every src.
			if ($src === '' || strpos($src, 'http') === 0 || strpos($src, '/') === 0) {
				continue;
			}

			$img->setAttribute('src', $base . $src);
		}

		// remove the redundant doctype/html/body DOMDocument added
		// http://stackoverflow.com/a/10023094
		// The "*" quantifiers are required: without them only tags with exactly
		// one character after the name would be matched.
		return preg_replace('~<(?:!DOCTYPE|/?(?:html|body))[^>]*>\s*~i', '', $DOM->saveHTML());
	}

	// Demo: feed the same one-image fragment to both converters and dump the
	// two results side by side so their output can be compared.
	$input = <<<HTM
	<img src="relative.html" />
	HTM;

	var_dump(regexToAbs($input,"http://www.de"), domdocToAbs($input,"http://www.de"));
No results found