mwiemarc · May 19, 2017 20:23
diff --git a/web2text.php b/web2text.php
 <?php
 /***
 * Pull HTML source for given URL and format for output
 * 2011 haliphax https://github.com/haliphax
 **/

 # Guarded so that loading this script more than once cannot redeclare the helper.
 if(! function_exists('web2text_html_to_text'))
 {
 	/**
 	 * Reduce an HTML document to loosely formatted plain text.
 	 *
 	 * The rules run in order, each on the output of the previous one:
 	 * script/style blocks, everything up to <body> and comments are dropped,
 	 * images become their alt text, list-like elements start an indented
 	 * line, block-level elements start a new line, every remaining tag
 	 * becomes a space, and finally whitespace is collapsed.
 	 *
 	 * @param string $html raw markup
 	 * @return string the text that is left once all rules have been applied
 	 */
 	function web2text_html_to_text($html)
 	{
 		$rules = [
 			# source line breaks are plain whitespace in HTML; turn them into a
 			# space (not nothing) so neighbouring words/attributes stay apart
 			['#\r|\n#', ' '],
 			# strip out <script> and <style> blocks
 			['#<(script|style)[^>]*>.*?</\1>#is', ''],
 			# strip everything above the <body> element, if there is one
 			['#.*<body[^>]*>#is', ''],
 			# strip out html comments
 			['#<\!--.*?-->#s', ''],
 			# replace "img" tags with their "alt" attributes
 			['#<img[^>]*(?:alt\s*=\s*(\'|")(.*?)\1)[^>]*>#is', ' [\2] '],
 			# replace list items with newline and indent; the quote around the
 			# input type is group 2 (group 1 is the tag itself) and may be empty
 			['#<(tr|li|dd|option|input[^>]+type\s*=\s*(\'|"|)(checkbox|radio)\2)[^>]*>#is', "\n\t"],
 			# replace block-level elements with newline
 			['#<(noscript|p|br|div|h[0-9]+|form|fieldset|legend)[^>]*>#is', "\n"],
 			# replace other elements with spaces
 			['#<[^>]+?>#s', ' '],
 			# replace &nbsp; with plain space
 			['#&nbsp;#', ' '],
 			# reduce repeated spaces/tabs down to one each
 			['#[\t ]{2,}#', ' '],
 			# reduce repeated newlines down to 2 each
 			['#(\s*\n\s*){2,}#', "\n \n"],
 			# left-trim
 			['#(^|\n) #', "\n"],
 		];

 		foreach($rules as $rule)
 		{
 			$result = preg_replace($rule[0], $rule[1], $html);
 			# preg_replace() returns null when PCRE gives up (e.g. backtrack
 			# limit on a very large page); keep the text from the previous step
 			if($result !== null)
 				$html = $result;
 		}

 		return $html;
 	}
 }
 ?>
 <!DOCTYPE html>
 <html>
 <head>
 	<title>Format page as plain text</title>
 </head>
 <body>
 <?php

 # only a non-empty string counts as a submitted URL (url[]=... posts an array)
 $url = (isset($_POST['url']) && is_string($_POST['url'])) ? trim($_POST['url']) : '';

 # form to get URL
 if($url === ''):
 ?>
 <form action="" method="POST">
 	<label for="url">Enter a URL:</label>
 	<br />
 	<input type="text" id="url" name="url" />
 	<br />
 	<button type="submit">Submit</button>
 </form>
 <?php
 else:

 # add protocol to URL if none supplied
 if(! preg_match('#^(https?)://#i', $url))
 	$url = "http://{$url}";
 # null = no transport available, false = transport failed, string = page source
 $html = null;

 # pull source using best method available
 if(function_exists('curl_init'))
 {
 	# curl
 	$ch = curl_init($url);
 	curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
 	curl_setopt($ch, CURLOPT_HEADER, false);
 	curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 	# keep the request, and any redirect it follows, on HTTP(S)
 	curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
 	curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
 	$html = curl_exec($ch);
 	curl_close($ch);
 }
 else if(ini_get('allow_url_fopen'))
 {
 	# fopen
 	$html = file_get_contents($url);
 }

 if($html === null)
 {
 	# no method; report it but still let the document close properly
 	echo '<p>Cannot retrieve HTML via curl nor fopen; exiting</p>';
 }
 else
 {
 	# output results; the URL is user input, so escape it before echoing
 	# NOTE(review): this link targets scrape.php while the form posts back to
 	# the current script -- confirm the deployed filename
 	$shown_url = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
 	echo "<a href='scrape.php'>Try another URL</a><p><b>{$shown_url}</b></p>";

 	if($html === false)
 	{
 		# curl_exec() / file_get_contents() signal failure with false
 		echo '<p>Could not retrieve the requested URL</p>';
 	}
 	else
 	{
 		# remote text is untrusted: escape stray markup characters, but leave
 		# entities the page already contains intact (double_encode = false)
 		$text = htmlspecialchars(web2text_html_to_text($html), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
 		echo "<pre>{$text}</pre>";
 	}
 }

 endif;
 ?>
 </body>
 </html>
	<?php
	/***
	* Pull HTML source for given URL and format for output
	* 2011 haliphax https://github.com/haliphax
	**/

	# Guarded so that loading this script more than once cannot redeclare the helper.
	if(! function_exists('web2text_html_to_text'))
	{
		/**
		 * Reduce an HTML document to loosely formatted plain text.
		 *
		 * The rules run in order, each on the output of the previous one:
		 * script/style blocks, everything up to <body> and comments are dropped,
		 * images become their alt text, list-like elements start an indented
		 * line, block-level elements start a new line, every remaining tag
		 * becomes a space, and finally whitespace is collapsed.
		 *
		 * @param string $html raw markup
		 * @return string the text that is left once all rules have been applied
		 */
		function web2text_html_to_text($html)
		{
			$rules = [
				# source line breaks are plain whitespace in HTML; turn them into a
				# space (not nothing) so neighbouring words/attributes stay apart
				['#\r|\n#', ' '],
				# strip out <script> and <style> blocks
				['#<(script|style)[^>]*>.*?</\1>#is', ''],
				# strip everything above the <body> element, if there is one
				['#.*<body[^>]*>#is', ''],
				# strip out html comments
				['#<\!--.*?-->#s', ''],
				# replace "img" tags with their "alt" attributes
				['#<img[^>]*(?:alt\s*=\s*(\'|")(.*?)\1)[^>]*>#is', ' [\2] '],
				# replace list items with newline and indent; the quote around the
				# input type is group 2 (group 1 is the tag itself) and may be empty
				['#<(tr|li|dd|option|input[^>]+type\s*=\s*(\'|"|)(checkbox|radio)\2)[^>]*>#is', "\n\t"],
				# replace block-level elements with newline
				['#<(noscript|p|br|div|h[0-9]+|form|fieldset|legend)[^>]*>#is', "\n"],
				# replace other elements with spaces
				['#<[^>]+?>#s', ' '],
				# replace &nbsp; with plain space
				['#&nbsp;#', ' '],
				# reduce repeated spaces/tabs down to one each
				['#[\t ]{2,}#', ' '],
				# reduce repeated newlines down to 2 each
				['#(\s*\n\s*){2,}#', "\n \n"],
				# left-trim
				['#(^|\n) #', "\n"],
			];

			foreach($rules as $rule)
			{
				$result = preg_replace($rule[0], $rule[1], $html);
				# preg_replace() returns null when PCRE gives up (e.g. backtrack
				# limit on a very large page); keep the text from the previous step
				if($result !== null)
					$html = $result;
			}

			return $html;
		}
	}
	?>
	<!DOCTYPE html>
	<html>
	<head>
		<title>Format page as plain text</title>
	</head>
	<body>
	<?php

	# only a non-empty string counts as a submitted URL (url[]=... posts an array)
	$url = (isset($_POST['url']) && is_string($_POST['url'])) ? trim($_POST['url']) : '';

	# form to get URL
	if($url === ''):
	?>
	<form action="" method="POST">
		<label for="url">Enter a URL:</label>
		<br />
		<input type="text" id="url" name="url" />
		<br />
		<button type="submit">Submit</button>
	</form>
	<?php
	else:

	# add protocol to URL if none supplied
	if(! preg_match('#^(https?)://#i', $url))
		$url = "http://{$url}";
	# null = no transport available, false = transport failed, string = page source
	$html = null;

	# pull source using best method available
	if(function_exists('curl_init'))
	{
		# curl
		$ch = curl_init($url);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
		curl_setopt($ch, CURLOPT_HEADER, false);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
		# keep the request, and any redirect it follows, on HTTP(S)
		curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
		curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
		$html = curl_exec($ch);
		curl_close($ch);
	}
	else if(ini_get('allow_url_fopen'))
	{
		# fopen
		$html = file_get_contents($url);
	}

	if($html === null)
	{
		# no method; report it but still let the document close properly
		echo '<p>Cannot retrieve HTML via curl nor fopen; exiting</p>';
	}
	else
	{
		# output results; the URL is user input, so escape it before echoing
		# NOTE(review): this link targets scrape.php while the form posts back to
		# the current script -- confirm the deployed filename
		$shown_url = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
		echo "<a href='scrape.php'>Try another URL</a><p><b>{$shown_url}</b></p>";

		if($html === false)
		{
			# curl_exec() / file_get_contents() signal failure with false
			echo '<p>Could not retrieve the requested URL</p>';
		}
		else
		{
			# remote text is untrusted: escape stray markup characters, but leave
			# entities the page already contains intact (double_encode = false)
			$text = htmlspecialchars(web2text_html_to_text($html), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
			echo "<pre>{$text}</pre>";
		}
	}

	endif;
	?>
	</body>
	</html>