Created
October 31, 2014 21:05
-
-
Save scumola/ba84bfc67e4b1631d57b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/php
<?php
// Open the MySQL connection that the helper functions below reach through
// `global $link`; abort the whole script if the server cannot be reached.
$link = mysql_connect('10.0.0.13', 'xxxxx', 'xxxxx')
    or die('Could not connect to DB: ' . mysql_error());
echo 'Connected to DB successfully';

// Make `tweets` the active database for every later query on $link.
// `=` binds tighter than `or`, so $db_selected still receives the call result.
$db_selected = mysql_select_db('tweets', $link)
    or die ('Can\'t use database tweets : ' . mysql_error());
/**
 * Report whether a URL already has a row in the `urls3` table.
 *
 * Runs the lookup on the global MySQL connection `$link`.
 *
 * @param string $original URL to look for in the `original` column.
 * @return bool true when at least one matching row exists; false when no row
 *              matches or when the lookup query itself failed.
 */
function check_db_first ($original) {
    global $link;

    // URLs arrive from tweet payloads and HTTP headers, so they may contain
    // quotes; escape before embedding the value in the statement.
    $escaped = mysql_real_escape_string($original, $link);
    $result = mysql_query("select expanded from urls3 where original = '$escaped'", $link);
    if (!$result) {
        echo "************************* MySQL Error: " . mysql_error() . "\n";
        // There is no result set to count; report "not found" instead of
        // handing `false` to mysql_num_rows().
        return false;
    }

    return mysql_num_rows($result) > 0;
}
/**
 * Store an original -> expanded URL mapping in `urls3` unless one already exists.
 *
 * A row is written only when all of the following hold:
 *   - $original is not already present (see check_db_first()),
 *   - $original is shorter than 255 bytes,
 *   - $expanded starts with "http" (case-insensitive),
 *   - $expanded differs from $original.
 *
 * NOTE(review): get_final_url() calls this with $expanded = 'NONE' for failed
 * lookups; that value does not start with "http", so those calls never write
 * a row. Confirm whether the marker is meant to be stored.
 *
 * @param string $original Short/original URL (lookup key).
 * @param string $expanded URL that $original resolves to.
 * @return true|null true only when $original was already stored; nothing is
 *                   returned when a row was inserted, rejected, or the insert failed.
 */
function insert_url ($original, $expanded) {
    global $link;

    if (check_db_first($original)) {
        return true;
    }

    $short_enough    = strlen($original) < 255;
    $looks_like_http = strncasecmp($expanded, "http", 4) === 0;
    $is_different    = strcmp($original, $expanded) !== 0;

    if ($short_enough && $looks_like_http && $is_different) {
        // Both values come from untrusted remote data; escape them before
        // building the statement.
        $safe_original = mysql_real_escape_string($original, $link);
        $safe_expanded = mysql_real_escape_string($expanded, $link);
        $result = mysql_query("insert into urls3 values (now(),'$safe_original','$safe_expanded')", $link);
        if (!$result) {
            echo "************************* MySQL Error (on insert): " . mysql_error() . "\n";
        }
    }
}
/**
 * Resolve a URL one redirect hop at a time, recording each hop via insert_url().
 *
 * Steps:
 *   1. Normalise the URL (trim, urldecode, "&amp;" -> "&").
 *   2. Stop immediately if the URL is already known to check_db_first().
 *   3. Issue a body-less cURL request without following redirects.
 *   4. On 301/302, read the Location header with get_headers() and recurse on it.
 *   5. On 502/503/404/403, call insert_url() with the 'NONE' marker.
 *   6. Otherwise look for a JavaScript redirect in the response text and
 *      recurse on it, or fall back to cURL's effective URL.
 *
 * @param string $url     URL to resolve.
 * @param int    $timeout Connect and total timeout in seconds for the cURL request.
 * @return string|bool true when the URL was already stored or its headers could
 *                     not be fetched; otherwise the result of resolving the next
 *                     hop, or cURL's effective URL for this request.
 */
function get_final_url( $url, $timeout = 5 )
{
    $user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1";

    // Undo HTML-entity-encoded ampersands left in URLs copied out of markup.
    $url = str_replace( "&amp;", "&", urldecode(trim($url)) );
    if (check_db_first($url)) {
        print ("Already in DB -> not crawling ($url)\n");
        return true;
    }

    $ch = curl_init();
    curl_setopt( $ch, CURLOPT_USERAGENT, $user_agent );
    curl_setopt( $ch, CURLOPT_URL, $url );
    // Redirects are followed manually below so every hop can be recorded.
    curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, false );
    curl_setopt( $ch, CURLOPT_ENCODING, "" );
    curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
    curl_setopt( $ch, CURLOPT_AUTOREFERER, true );
    curl_setopt( $ch, CURLOPT_NOBODY, true );
    curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $timeout );
    curl_setopt( $ch, CURLOPT_TIMEOUT, $timeout );
    curl_setopt( $ch, CURLOPT_MAXREDIRS, 1 );
    # curl_setopt( $ch, CURLOPT_PROXY, "http://10.0.0.5:3128");
    $content = curl_exec( $ch );
    $response = curl_getinfo( $ch );
    curl_close ( $ch );

    // curl_exec() yields false on failure; keep the regex input a string.
    if ($content === false) {
        $content = "";
    }

    $status = $response['http_code'];
    if ($status == 301 || $status == 302) {
        ini_set("user_agent", $user_agent);
        $headers = get_headers($response['url']);
        // get_headers() signals failure with false, which isset() cannot detect.
        if ($headers === false) {
            insert_url($url,'NONE');
            return true;
        }
        foreach ( $headers as $value ) {
            if ( strncasecmp($value, "location:", 9) === 0 ) {
                $next_url = trim( substr( $value, 9 ) );
                insert_url($url,$next_url);
                print ("RE-CRAWLED (301 or 302): $url -> $next_url\n");
                // Carry the caller's timeout through to the next hop.
                return get_final_url( $next_url, $timeout );
            }
        }
        // No Location header found: fall through to the generic handling below.
    } elseif (in_array($status, array(502, 503, 404, 403))) {
        insert_url($url,'NONE');
    }

    if ( preg_match("/window\.location\.replace\('(.*)'\)/i", $content, $value) ||
         preg_match("/window\.location\=\"(.*)\"/i", $content, $value)
    ) {
        insert_url($url,$value[1]);
        print ("RE-CRAWLED (Javascript): $url -> $value[1]\n");
        return get_final_url ( $value[1], $timeout );
    }

    $next_url = $response['url'];
    if (strcmp($url,$next_url)) {
        insert_url($url,$next_url);
        print ("CRAWLED: $url -> $next_url\n");
    }
    return $next_url;
}
// Main loop: read one JSON document per line from stdin and, for every URL
// entity of each object that has a `created_at` field, store the
// original -> expanded mapping and then resolve the expanded URL further.
$handle = fopen("php://stdin", 'r');
// No length cap: a capped read could cut a long JSON line in two.
while (($buffer = fgets($handle)) !== false) {
    $obj = json_decode($buffer);

    // Lines that are not valid JSON, or objects without `created_at`
    // (for example ones carrying only a `delete` key), are skipped.
    if (!isset($obj->{'created_at'})) {
        continue;
    }

    // Tolerate objects with no URL entities.
    $urls = isset($obj->{'entities'}->{'urls'}) ? $obj->{'entities'}->{'urls'} : array();
    if (!is_array($urls)) {
        continue;
    }

    foreach ($urls as $item) {
        $original_url = isset($item->{'url'}) ? $item->{'url'} : '';
        $expanded_url = isset($item->{'expanded_url'}) ? $item->{'expanded_url'} : '';
        print ("\nORIG: $original_url -> $expanded_url\n");
        if (strlen($expanded_url) > 0) {
            insert_url($original_url,$expanded_url);
            get_final_url($expanded_url);
        }
    }
}
fclose($handle);
mysql_close($link);
print ("E X I T\n");
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment