Skip to content

Instantly share code, notes, and snippets.

@jamesstout
Created July 9, 2013 20:22
Show Gist options
  • Save jamesstout/5960938 to your computer and use it in GitHub Desktop.
Save jamesstout/5960938 to your computer and use it in GitHub Desktop.
Scans http://brettterpstra.com/otherstuff/systematic-linkage/ and attempts to output the "Top Picks" as a Markdown list, which renders to HTML like this: http://cloud.zoooot.com/Q8Wr
<?php
// Diagnostics that CURL() fills in (through `global`) when a request fails;
// they are only read here to build the error message.
$curl_error_str = "";
$curl_error_no = 0;
$httpCode = 0;
$curl_info = "";
$page = "";
$html = "";

$url = 'http://brettterpstra.com/otherstuff/systematic-linkage/';

if (!$page = CURL($url)) {
    $str = "\nScript: " . $_SERVER["SCRIPT_FILENAME"] . "\nFile: " . __FILE__ . "\nFunction: "
        . __FUNCTION__ . "() - line: " . __LINE__ . "\nERROR($curl_error_no) - $curl_error_str\nHTTP Response Code: $httpCode\nCURL INFO:\n$curl_info";
    die("ERROR getting $url: $str");
}

// Strip out the page header and footer: keep only what sits between the
// first article heading and the footer element.
$start = '<h3 id="introductions-all-around">';
$end = '<footer>';
$html = GetBetween($page, $start, $end);

// GetBetween() hands back an empty string when the start marker is missing,
// which means the page layout changed and there is nothing useful to parse.
if ((string) $html === '') {
    die("ERROR parsing $url: nothing found between $start and $end\n");
}

// IMPORTANT: prepend an XML declaration naming UTF-8. Without it DOMDocument
// hands back node values in the wrong encoding even though the page is UTF-8.
// The start marker is put back because GetBetween() strips it.
$html = '<?xml version="1.0" encoding="utf-8" ?>' . $start . $html;

// The fragment is cut out of the middle of a page, so the parser is likely to
// complain about unbalanced or unknown tags. Collect those complaints
// internally instead of letting them leak into the Markdown output.
$previousLibxmlSetting = libxml_use_internal_errors(true);
$xml = new DOMDocument();
$xml->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

// Emit one Markdown list item per link that looks like a "top pick".
foreach ($xml->getElementsByTagName('a') as $link) {
    $href = $link->getAttribute('href');
    if (!$href) {
        // Anchors without a usable href cannot be turned into a Markdown link.
        continue;
    }

    if (isMaybeATopPick($href, $link->nodeValue)) {
        echo "* [$link->nodeValue](" . $href . ")\n";
    }
}

// Stop here: everything below this point is function definitions only.
exit;
/**
 * Heuristic: does this link look like an Apple/app related "top pick"?
 *
 * A link qualifies when its URL or its text contains at least one of the
 * keywords below (case-insensitive substring match) and the link is not
 * rejected by andIsNotOneOfThese().
 *
 * @param string $url  The link target (href).
 * @param string $text The visible link text.
 *
 * @return bool true when the link qualifies, false otherwise.
 */
function isMaybeATopPick($url, $text)
{
    $keywords = array(
        "app",
        "mac",
        "itunes.apple.com",
        "OS X",
        "iOS",
        "iTunes",
        "iPhone",
        "iPad",
        "Apple",
    );

    // Step 1: look for the first keyword present in either the URL or the text.
    $mentionsKeyword = false;
    foreach ($keywords as $keyword) {
        if (stripos($url, $keyword) === false && stripos($text, $keyword) === false) {
            continue;
        }
        $mentionsKeyword = true;
        break;
    }

    if (!$mentionsKeyword) {
        return false;
    }

    // Step 2: the exclusion check gives the same answer whichever keyword
    // matched, so asking once is enough.
    return andIsNotOneOfThese($url, $text) == true;
}
/**
 * Exclusion filter applied to candidate "top pick" links.
 *
 * Rejects a link when its text or its URL contains any entry of the
 * exclusion list (case-insensitive substring match).
 *
 * @param string $url  The link target (href).
 * @param string $text The visible link text.
 *
 * @return bool false when an excluded entry is found, true otherwise.
 */
function andIsNotOneOfThese($url, $text)
{
    $excluded = array(
        "alpha.app.net",
        "join.app.net",
        "minimalmac",
        "http://macsparky.com",
        "macdrifter.com",
        "https://twitter.com",
        "Frakes",
        "MacStories",
        "5by5.tv/mpu",
        "www.theverge.com",
    );

    for ($i = 0, $total = count($excluded); $i < $total; $i++) {
        $marker = $excluded[$i];

        // Neither the text nor the URL mentions this entry: try the next one.
        if (stripos($text, $marker) === false && stripos($url, $marker) === false) {
            continue;
        }

        return false;
    }

    return true;
}
/**
 * Return the part of $content that lies between the first occurrence of
 * $start and the next occurrence of $end after it.
 *
 * Neither marker is included in the result.
 *
 * @param string $content Text to search.
 * @param string $start   Opening marker.
 * @param string $end     Closing marker.
 *
 * @return string The enclosed text; everything after $start when $end is
 *                empty or not found; '' when $start is empty or not found.
 */
function GetBetween($content, $start, $end)
{
    $content = (string) $content;
    $start = (string) $start;
    $end = (string) $end;

    // An empty marker cannot be searched for (explode() and strpos() reject
    // an empty needle), so treat it as "not found" instead of erroring out.
    if ($start === '') {
        return '';
    }

    $from = strpos($content, $start);
    if ($from === false) {
        return '';
    }

    // The cast covers old PHP versions where substr() returns false instead
    // of '' when the marker sits at the very end of the content.
    $rest = (string) substr($content, $from + strlen($start));

    if ($end === '') {
        return $rest;
    }

    $to = strpos($rest, $end);
    if ($to === false) {
        // No closing marker: keep everything after the opening one.
        return $rest;
    }

    return (string) substr($rest, 0, $to);
}
/**
 * Fetch a URL with cURL, retrying when the transfer fails.
 *
 * Failure details are published through the globals $httpCode, $curl_info,
 * $curl_error_str and $curl_error_no.
 *
 * @param string $url     Address to fetch.
 * @param int    $retries Maximum number of attempts; values below 1 are
 *                        treated as 1 so the URL is always tried once.
 *
 * @return string|false The response body, or false when the handle could not
 *                      be created or every attempt failed.
 */
function CURL($url, $retries = 3)
{
    global $httpCode;
    global $curl_error_str;
    global $curl_error_no;
    global $curl_info;

    $timeout = 5;
    $connectionTimeout = 2;

    // Defined up front so the curl_init() failure path returns false as well.
    $result = false;

    $curl = curl_init($url);

    // curl_init() yields a resource on PHP < 8 but a CurlHandle object on
    // PHP >= 8, so is_resource() would reject every valid handle there.
    // Comparing against false works on both.
    if ($curl === false) {
        $curl_error_str = "curl_init() failed for $url";
        return $result;
    }

    curl_setopt($curl, CURLOPT_FAILONERROR, true);
    curl_setopt($curl, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    // NOTE(review): certificate and host verification are switched off, which
    // leaves HTTPS fetches open to interception. Kept as in the original;
    // confirm this is intended before pointing the script at HTTPS URLs.
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, $connectionTimeout);
    curl_setopt($curl, CURLOPT_NOSIGNAL, 1);

    // Post-decrement so that $retries attempts really happen (a pre-decrement
    // would run one attempt fewer, and none at all for $retries = 1).
    $attemptsLeft = max(1, (int) $retries);
    while ($result === false && $attemptsLeft-- > 0) {
        $result = curl_exec($curl);
    }

    // Callers treat any falsy result as a failure, so collect the transfer
    // details for their error message in that case.
    if (!$result) {
        $httpCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
        $transferInfo = curl_getinfo($curl);
        if (is_array($transferInfo)) {
            $curl_info = var_export($transferInfo, true);
        }
    }

    // Record the error of the last attempt, if any.
    if (curl_errno($curl)) {
        $curl_error_str = curl_error($curl);
        $curl_error_no = curl_errno($curl);
    }

    // From PHP 8.0 on the handle is an object released automatically and
    // curl_close() does nothing, so only call it where it still matters.
    if (PHP_VERSION_ID < 80000) {
        curl_close($curl);
    }

    return $result;
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment