Skip to content

Instantly share code, notes, and snippets.

@mwiemarc
Forked from haliphax/web2text.php
Created May 19, 2017 20:23
Show Gist options
  • Save mwiemarc/80ba5b0b0282452dbbb608af0a286515 to your computer and use it in GitHub Desktop.
Save mwiemarc/80ba5b0b0282452dbbb608af0a286515 to your computer and use it in GitHub Desktop.
<?php
/***
* Pull HTML source for given URL and format for output
* 2011 haliphax https://github.com/haliphax
**/

/**
 * Turn the submitted form value into a fetchable URL.
 *
 * Surrounding whitespace is trimmed and "http://" is prepended when the value
 * does not already start with http:// or https://.
 *
 * @param mixed $raw Value taken from the request (may be an array or missing).
 * @return string|null Normalized URL, or null when the value is not a non-empty string.
 */
function web2text_normalize_url($raw)
{
    # "url[]=x" arrives as an array; treat anything but a string as "no URL"
    if (!is_string($raw)) {
        return null;
    }

    $url = trim($raw);
    if ($url === '') {
        return null;
    }

    # add protocol to URL if none supplied
    if (!preg_match('#^https?://#i', $url)) {
        $url = "http://{$url}";
    }

    return $url;
}

/**
 * Download the body of a page using the best transport available.
 *
 * curl is used when the extension is loaded; otherwise file_get_contents()
 * is used when allow_url_fopen is enabled.
 *
 * NOTE(review): the URL is user supplied, so this script can be pointed at
 * hosts that are only reachable from the server (SSRF). Do not expose it
 * publicly without adding a host allow-list.
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return string|false|null Page body, false when the request failed, or
 *                           null when neither transport is available.
 */
function web2text_fetch($url)
{
    if (function_exists('curl_init')) {
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        # never let the request (or a redirect) leave http/https
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        $body = curl_exec($ch);
        curl_close($ch);

        return $body;
    }

    if (ini_get('allow_url_fopen')) {
        return file_get_contents($url);
    }

    return null;
}

/**
 * Apply an ordered list of regex replacements to a string.
 *
 * A rule whose preg_replace() call fails (returns null, e.g. when a PCRE
 * limit is hit) is skipped so the text gathered so far is not lost.
 *
 * @param array  $rules List of array(pattern, replacement) pairs.
 * @param string $text  Input text.
 * @return string Text after all successful replacements.
 */
function web2text_apply_rules($rules, $text)
{
    foreach ($rules as $rule) {
        $result = preg_replace($rule[0], $rule[1], $text);
        if ($result !== null) {
            $text = $result;
        }
    }

    return $text;
}

/**
 * Convert an HTML document into roughly formatted plain text.
 *
 * The result is plain text with entities decoded; it is NOT safe to print
 * into an HTML page without escaping it first.
 *
 * @param string $html Raw HTML source.
 * @return string Plain-text rendering of the document body.
 */
function web2text_to_text($html)
{
    $markupRules = array(
        # newlines are plain whitespace in HTML; use a space so words stay apart
        array('#\r|\n#', ' '),
        # strip out <script> and <style> blocks
        array('#<(script|style)[^>]*>.*?</\1>#is', ''),
        # strip everything above the <body> element, if there is one
        array('#.*<body[^>]*>#is', ''),
        # strip out html comments
        array('#<\!--.*?-->#s', ''),
        # replace "img" tags with their "alt" attributes
        array('#<img[^>]*(?:alt\s*=\s*(\'|")(.*?)\1)[^>]*>#is', ' [\2] '),
        # replace list items with newline and indent; \2 is the (possibly
        # empty) quote that opened the type attribute value
        array('#<(tr|li|dd|option|input[^>]+type\s*=\s*(["\']?)(?:checkbox|radio)\2)[^>]*>#is', "\n\t"),
        # replace block-level elements with newline
        array('#<(noscript|p|br|div|h[0-9]+|form|fieldset|legend)[^>]*>#is', "\n"),
        # replace other elements with spaces
        array('#<[^>]+?>#s', ' '),
        # replace &nbsp; with plain space
        array('#&nbsp;#', ' '),
    );

    $whitespaceRules = array(
        # reduce repeated spaces/tabs down to one each
        array('#[\t ]{2,}#', ' '),
        # reduce repeated newlines down to 2 each
        array('#(\s*\n\s*){2,}#', "\n \n"),
        # left-trim
        array('#(^|\n) #', "\n"),
    );

    $text = web2text_apply_rules($markupRules, $html);
    # decode only after the tags are gone, so "&lt;b&gt;" is kept as text
    $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

    return web2text_apply_rules($whitespaceRules, $text);
}
?>
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<title>Format page as plain text</title>
</head>
<body>
<?php
$url = array_key_exists('url', $_POST) ? web2text_normalize_url($_POST['url']) : null;

# form to get URL (also shown again when the submitted value is unusable)
if($url === null):
?>
<form action="" method="POST">
<label for="url">Enter a URL:</label>
<br />
<input type="text" id="url" name="url" />
<br />
<button type="submit">Submit</button>
</form>
<?php
else:
    $html = web2text_fetch($url);

    # escape everything that originates from the request or the remote page
    $safeUrl = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    # an empty href points back at this script, whatever it is named
    echo "<a href=''>Try another URL</a><p><b>{$safeUrl}</b></p>";

    if($html === null):
        # no method available
        echo '<p>Cannot retrieve HTML via curl nor fopen; exiting</p>';
    elseif($html === false):
        echo '<p>Could not retrieve the requested URL</p>';
    else:
        # output results
        $text = htmlspecialchars(web2text_to_text($html), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        echo "<pre>{$text}</pre>";
    endif;
endif;
?>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment