-
-
Save mwiemarc/80ba5b0b0282452dbbb608af0a286515 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/***
 * Pull HTML source for given URL and format for output
 * 2011 haliphax https://github.com/haliphax
 *
 * Without a POSTed "url" field the page renders a form asking for one.
 * With one, the page fetches that URL (curl when the extension is loaded,
 * otherwise file_get_contents() when allow_url_fopen is enabled), reduces
 * the markup to roughly formatted plain text through a series of regex
 * passes, and prints the result inside a "pre" element.
 *
 * Everything echoed back to the browser originates from the request or
 * from a remote server, so it is HTML-escaped before output.
 **/
?>
<!DOCTYPE html>
<html>
	<head>
		<title>Format page as plain text</title>
	</head>
	<body>
<?php
# form to get URL
if(! array_key_exists('url', $_POST)):
?>
		<form action="" method="POST">
			<label for="url">Enter a URL:</label>
			<br />
			<input type="text" id="url" name="url" />
			<br />
			<button type="submit">Submit</button>
		</form>
<?php
else:
	# a POST field can arrive as an array (url[]=...); only a string is usable
	$url = is_string($_POST['url']) ? trim($_POST['url']) : '';

	# add protocol to URL if none supplied
	if(! preg_match('#^(https?)://#i', $url))
	{
		$url = "http://{$url}";
	}

	# false means "nothing retrieved"; both fetchers also return false on failure
	$html = false;
	$error = '';

	# pull source using best method available
	if(function_exists('curl_init'))
	{
		# curl
		$ch = curl_init($url);

		if($ch !== false)
		{
			curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
			# bound the redirect chain so a redirect loop cannot hang the request
			curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
			curl_setopt($ch, CURLOPT_HEADER, false);
			curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

			# the URL is user input: keep the request, and any redirect it
			# triggers, on http/https (constants are absent on very old builds)
			if(defined('CURLOPT_PROTOCOLS'))
			{
				curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
				curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
			}

			$html = curl_exec($ch);
			curl_close($ch);
		}
	}
	else if(ini_get('allow_url_fopen'))
	{
		# fopen
		$html = file_get_contents($url);
	}
	else
	{
		# no method; report it below instead of die() so the document is still closed
		$error = 'Cannot retrieve HTML via curl nor fopen; exiting';
	}

	# the URL is echoed back into the page, so it must be HTML-escaped
	# (ENT_SUBSTITUTE only exists on PHP >= 5.4)
	$escape_flags = ENT_QUOTES | (defined('ENT_SUBSTITUTE') ? ENT_SUBSTITUTE : 0);
	$safe_url = htmlspecialchars($url, $escape_flags, 'UTF-8');

	echo "<a href='scrape.php'>Try another URL</a>";

	if($error !== '')
	{
		echo "<p>{$error}</p>";
	}
	else if(! is_string($html))
	{
		# fetch failed (bad host, timeout, blocked protocol, ...)
		echo "<p>Could not retrieve <b>{$safe_url}</b></p>";
	}
	else
	{
		# ordered list of (pattern, replacement) passes; order matters because
		# later passes rely on earlier ones having removed or rewritten tags
		$passes = array(
			# turn line breaks into spaces so words on adjacent source lines stay separate
			array('#\r|\n#', ' '),
			# strip out <script> and <style> blocks
			array('#<(script|style)[^>]*>.*?</\1>#is', ''),
			# strip everything above the <body> element, if there is one
			array('#.*<body[^>]*>#is', ''),
			# strip out html comments
			array('#<\!--.*?-->#s', ''),
			# replace "img" tags with their "alt" attributes
			array('#<img[^>]*(?:alt\s*=\s*(\'|")(.*?)\1)[^>]*>#is', ' [\2] '),
			# replace list items (and checkbox/radio inputs, quoted or not) with newline and indent
			array('#<(?:tr|li|dd|option|input[^>]+type\s*=\s*[\'"]?(?:checkbox|radio)[\'"]?)[^>]*>#is', "\n\t"),
			# replace block-level elements with newline
			array('#<(noscript|p|br|div|h[0-9]+|form|fieldset|legend)[^>]*>#is', "\n"),
			# replace other elements with spaces
			array('#<[^>]+?>#s', ' '),
			# replace &nbsp; with plain space
			array('#&nbsp;#', ' '),
			# reduce repeated spaces/tabs down to one each
			array('#[\t ]{2,}#', ' '),
			# reduce repeated newlines down to 2 each
			array('#(\s*\n\s*){2,}#', "\n \n"),
			# left-trim
			array('#(^|\n) #', "\n"),
		);

		foreach($passes as $pass)
		{
			$result = preg_replace($pass[0], $pass[1], $html);

			# preg_replace() returns null on a PCRE error (e.g. backtrack limit
			# on a huge page); skip that pass rather than lose all the text
			if($result !== null)
			{
				$html = $result;
			}
		}

		# any "<" or ">" still present was not part of a complete tag; encode it
		# so the browser shows it as text instead of parsing it as markup.
		# Character entities from the page are left intact for the browser to decode.
		$html = str_replace(array('<', '>'), array('&lt;', '&gt;'), $html);

		# output results
		echo "<p><b>{$safe_url}</b></p>";
		echo "<pre>{$html}</pre>";
	}
endif;
?>
	</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment