Created
July 9, 2013 20:22
-
-
Save jamesstout/5960938 to your computer and use it in GitHub Desktop.
Scans http://brettterpstra.com/otherstuff/systematic-linkage/ and attempts to output the "Top Picks" in markdown format producing HTML like this: http://cloud.zoooot.com/Q8Wr
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Scrape the "Systematic Linkage" page and print every link that looks like a
// "Top Pick" as a markdown bullet: "* [link text](href)".

// Failure details are reported by CURL() through these globals.
$curl_error_str = "";
$curl_error_no = 0;
$httpCode = 0;
$curl_info = "";
$page = "";
$html = "";

$url = 'http://brettterpstra.com/otherstuff/systematic-linkage/';

$page = CURL($url);
if (!$page) {
    // __FUNCTION__ is empty at file scope, so only script, file and line are reported.
    $str = "\nScript: " . $_SERVER["SCRIPT_FILENAME"] . "\nFile: " . __FILE__ . "\nLine: " . __LINE__
        . "\nERROR($curl_error_no) - $curl_error_str\nHTTP Response Code: $httpCode\nCURL INFO:\n$curl_info";
    die("ERROR getting $url: $str");
}

// Strip out all the header stuff: keep only what lies between the first
// section heading and the footer.
$start = '<h3 id="introductions-all-around">';
$end = '<footer>';
$html = GetBetween($page, $start, $end);
if ($html === '') {
    // GetBetween() returns '' when the start marker is missing; without this
    // check the script would silently print nothing.
    die("ERROR parsing $url: no content found after marker $start");
}

// IMP - declare the encoding as UTF-8, otherwise DOMDocument returns node
// values in the wrong encoding. The start marker removed by GetBetween() is
// put back so the fragment begins with a complete heading element.
$html = '<?xml version="1.0" encoding="utf-8" ?>' . $start . $html;

// Build the DOM. The page is only a fragment and may contain tags libxml does
// not know, so parse warnings are collected internally instead of being
// printed into the markdown output.
$xml = new DOMDocument();
$previousErrorMode = libxml_use_internal_errors(true);
$loaded = $xml->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);
if (!$loaded) {
    die("ERROR parsing $url: DOMDocument could not load the extracted HTML");
}

foreach ($xml->getElementsByTagName('a') as $link) {
    $href = $link->getAttribute('href');
    if ($href === '') {
        // Anchor without a target - nothing to link to.
        continue;
    }
    if (isMaybeATopPick($href, $link->nodeValue) == true) {
        echo "* [$link->nodeValue](" . $href . ")\n";
    }
}
exit;
/**
 * Decide whether a link looks like a "Top Pick".
 *
 * A link qualifies when its URL or its text contains one of the keywords
 * below (case-insensitive) and andIsNotOneOfThese() does not veto it.
 *
 * @param string $url  The link's href.
 * @param string $text The link's visible text.
 * @return bool True when the link should be listed.
 */
function isMaybeATopPick($url, $text)
{
    $keywords = array( "app", "mac", "itunes.apple.com", "OS X", "iOS", "iTunes", "iPhone", "iPad", "Apple" );

    foreach ($keywords as $keyword) {
        if (stripos($url, $keyword) === false && stripos($text, $keyword) === false) {
            continue;
        }
        // The veto depends only on $url and $text, never on the keyword, so
        // the verdict for the first matching keyword is the final verdict.
        return andIsNotOneOfThese($url, $text) == true;
    }

    return false;
}
/**
 * Veto filter for candidate links.
 *
 * Returns false as soon as the link text or the URL contains any of the
 * excluded markers (case-insensitive); returns true when none of them occur.
 *
 * @param string $url  The link's href.
 * @param string $text The link's visible text.
 * @return bool True when the link is NOT on the exclusion list.
 */
function andIsNotOneOfThese($url, $text)
{
    $excluded = array( "alpha.app.net", "join.app.net", "minimalmac", "http://macsparky.com", "macdrifter.com" , "https://twitter.com", "Frakes", "MacStories", "5by5.tv/mpu", "www.theverge.com");

    $total = count($excluded);
    for ($i = 0; $i < $total; $i++) {
        $marker = $excluded[$i];
        if (stripos($text, $marker) === false && stripos($url, $marker) === false) {
            continue;
        }
        return false;
    }

    return true;
}
/**
 * Extract the text that follows the first occurrence of $start.
 *
 * The extracted text stops at whichever comes first: the next occurrence of
 * $start, the first occurrence of $end, or the end of $content.
 *
 * @param string $content Text to search.
 * @param string $start   Opening delimiter (not included in the result).
 * @param string $end     Closing delimiter (not included in the result).
 * @return string The extracted text, or '' when $start does not occur.
 */
function GetBetween($content, $start, $end)
{
    // Only the second piece is needed, so splitting into at most three is enough.
    $pieces = explode($start, $content, 3);
    if (!isset($pieces[1])) {
        return '';
    }

    // Everything before the first $end; the whole piece when $end is absent.
    $head = explode($end, $pieces[1], 2);
    return $head[0];
}
/**
 * Fetch $url with cURL and return the response body.
 *
 * Failure details are published through globals rather than the return value:
 *   - $httpCode and $curl_info are set when the result is empty or false;
 *   - $curl_error_str and $curl_error_no are set when cURL reports an error
 *     or when the handle cannot be created.
 *
 * @param string $url     Address to fetch.
 * @param int    $retries Maximum number of attempts; at least one is always made.
 * @return string|false The response body, or false when every attempt failed.
 */
function CURL($url, $retries = 3)
{
    global $httpCode;
    global $curl_error_str;
    global $curl_error_no;
    global $curl_info;

    $timeout = 5;
    $connectionTimeout = 2;

    // Initialised up front so every exit path returns a defined value.
    $result = false;

    // curl_init() returns a resource on PHP < 8 and a CurlHandle object on
    // PHP >= 8, so is_resource() cannot be used to detect success; false is
    // the failure value on every version.
    $curl = curl_init($url);
    if ($curl === false) {
        $curl_error_str = "curl_init() failed for $url";
        return $result;
    }

    curl_setopt($curl, CURLOPT_FAILONERROR, true);
    curl_setopt($curl, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    // NOTE(review): host and peer verification are switched off. That is only
    // tolerable because the caller in this file fetches a plain-http URL;
    // enable both before using this function for https.
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, $connectionTimeout);
    curl_setopt($curl, CURLOPT_NOSIGNAL, 1);

    // Try until a transfer succeeds or the attempts are used up.
    $attemptsLeft = max(1, (int) $retries);
    while (($result === false) && ($attemptsLeft-- > 0)) {
        $result = curl_exec($curl);
    }

    if (!$result) {
        $httpCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
        $curl_info_temp = curl_getinfo($curl);
        if (is_array($curl_info_temp)) {
            $curl_info = var_export($curl_info_temp, true);
        }
    }

    // Record the error of the last attempt, if there was one.
    if (curl_errno($curl)) {
        $curl_error_str = curl_error($curl);
        $curl_error_no = curl_errno($curl);
    }

    // Only resource handles (PHP < 8) need closing; on PHP >= 8 curl_close()
    // has no effect.
    if (is_resource($curl)) {
        curl_close($curl);
    }

    return $result;
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment