Skip to content

Instantly share code, notes, and snippets.

@scumola
Created October 31, 2014 21:05
Show Gist options
  • Save scumola/ba84bfc67e4b1631d57b to your computer and use it in GitHub Desktop.
Save scumola/ba84bfc67e4b1631d57b to your computer and use it in GitHub Desktop.
#!/usr/bin/php
<?php
// Open the MySQL connection that the helper functions below share via `global $link`.
// NOTE(review): host and credentials are hard-coded here; consider moving them to configuration.
$link = mysql_connect('10.0.0.13', 'xxxxx', 'xxxxx');
if (!$link) {
    // Report on stderr and exit non-zero so a calling shell pipeline can detect the failure.
    fwrite(STDERR, 'Could not connect to DB: ' . mysql_error() . "\n");
    exit(1);
}
// Trailing newline keeps this message from running into the crawler's later output.
echo "Connected to DB successfully\n";

// Select the database that the unqualified `urls3` queries below run against.
$db_selected = mysql_select_db('tweets', $link);
if (!$db_selected) {
    fwrite(STDERR, 'Can\'t use database tweets : ' . mysql_error() . "\n");
    exit(1);
}
/**
 * Check whether a URL already has a row in the urls3 table.
 *
 * @param string $original URL to look up in the `original` column.
 * @return bool true when at least one matching row exists; false when no row
 *              matches or when the query itself failed.
 */
function check_db_first ($original) {
    global $link;

    // The URL comes from tweet payloads / HTTP headers, so it must be escaped
    // before being placed inside the SQL string literal.
    $escaped = mysql_real_escape_string($original, $link);
    $result = mysql_query("select expanded from urls3 where original = '$escaped'", $link);
    if (!$result) {
        echo "************************* MySQL Error: " . mysql_error() . "\n";
        // There is no result set to count; report "not found" instead of
        // passing false to mysql_num_rows().
        return false;
    }

    return mysql_num_rows($result) > 0;
}
/**
 * Store an original -> expanded URL mapping in urls3 unless one already exists.
 *
 * A row is written only when all of the following hold:
 *   - $original is shorter than 255 bytes,
 *   - $expanded starts with "http" (case-insensitive),
 *   - $original and $expanded differ.
 *
 * NOTE(review): callers in this file pass 'NONE' as $expanded for failed
 * fetches; that value does not pass the "http" prefix check, so no row is
 * written for it. Confirm whether that is intended.
 *
 * @param string $original URL as it was found.
 * @param string $expanded URL that $original resolves to.
 * @return true|null true when $original is already present in urls3;
 *                   otherwise nothing is returned (null).
 */
function insert_url ($original, $expanded) {
    global $link;

    if (check_db_first($original)) {
        return true;
    }

    $is_storable = (strlen($original) < 255)
        && (strncmp(strtolower($expanded), "http", 4) === 0)
        && (strcmp($original, $expanded) !== 0);

    if ($is_storable) {
        // Both values originate from tweets / remote servers: escape them
        // before embedding them in the SQL statement.
        $safe_original = mysql_real_escape_string($original, $link);
        $safe_expanded = mysql_real_escape_string($expanded, $link);
        $result = mysql_query("insert into urls3 values (now(),'$safe_original','$safe_expanded')", $link);
        if (!$result) {
            echo "************************* MySQL Error (on insert): " . mysql_error() . "\n";
        }
    }
}
/**
 * Follow a URL one redirect hop at a time, recording every hop via insert_url().
 *
 * The URL is trimmed, URL-decoded and has "&amp;" replaced by "&" before use.
 * URLs that check_db_first() already knows are not fetched again.
 *
 * @param string $url      URL to resolve.
 * @param int    $timeout  Connect and total timeout, in seconds, for the curl request.
 * @param int    $max_hops Maximum number of further redirect hops to follow;
 *                         bounds the recursion when a hop cannot be cached in the DB.
 * @return bool|string true when the URL was already in the DB or the redirect
 *                     headers could not be fetched; otherwise the URL reported
 *                     by curl for the last request made (or $url itself when
 *                     the hop limit is exhausted).
 */
function get_final_url( $url, $timeout = 5, $max_hops = 10 )
{
    $url = str_replace( "&amp;", "&", urldecode(trim($url)) );
    if (check_db_first($url)) {
        print ("Already in DB -> not crawling ($url)\n");
        return true;
    }
    if ($max_hops <= 0) {
        // Redirect chain is too long (or loops without being cached): stop here.
        print ("Too many redirects -> not crawling ($url)\n");
        return $url;
    }

    $user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1";

    $ch = curl_init();
    curl_setopt( $ch, CURLOPT_USERAGENT, $user_agent );
    curl_setopt( $ch, CURLOPT_URL, $url );
    // Redirects are followed manually below so that each hop can be recorded.
    curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, false );
    curl_setopt( $ch, CURLOPT_ENCODING, "" );
    curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
    curl_setopt( $ch, CURLOPT_AUTOREFERER, true );
    // NOTE(review): CURLOPT_NOBODY requests no response body, so the
    // JavaScript-redirect patterns further down can only match if a body is
    // returned anyway; confirm whether a body-carrying request was intended.
    curl_setopt( $ch, CURLOPT_NOBODY, true );
    curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $timeout );
    curl_setopt( $ch, CURLOPT_TIMEOUT, $timeout );
    curl_setopt( $ch, CURLOPT_MAXREDIRS, 1 );
    $content = curl_exec( $ch );
    $response = curl_getinfo( $ch );
    curl_close ( $ch );

    // curl_exec() returns false on failure; normalise so preg_match() gets a string.
    if (!is_string($content)) {
        $content = "";
    }

    $code = $response['http_code'];
    if ($code == 301 || $code == 302) {
        // get_headers() uses PHP's stream wrapper, which takes its UA from this ini setting.
        ini_set("user_agent", $user_agent);
        $headers = get_headers($response['url']);
        if ($headers === false) {
            // get_headers() signals failure with false (never with an unset value).
            insert_url($url,'NONE');
            return true;
        }
        foreach ( $headers as $value ) {
            if ( strncasecmp( $value, "location:", 9 ) === 0 ) {
                $next_url = trim( substr( $value, 9 ) );
                insert_url($url,$next_url);
                print ("RE-CRAWLED (301 or 302): $url -> $next_url\n");
                // Keep the caller's timeout and count this hop against the limit.
                return get_final_url( $next_url, $timeout, $max_hops - 1 );
            }
        }
        // No Location header found: fall through to the generic handling below.
    } elseif (in_array($code, array(502, 503, 404, 403), true)) {
        insert_url($url,'NONE');
    }

    if ( preg_match("/window\.location\.replace\('(.*)'\)/i", $content, $value) ||
         preg_match("/window\.location\=\"(.*)\"/i", $content, $value)
    ) {
        insert_url($url,$value[1]);
        print ("RE-CRAWLED (Javascript): $url -> $value[1]\n");
        return get_final_url( $value[1], $timeout, $max_hops - 1 );
    }

    $next_url = $response['url'];
    if (strcmp($url,$next_url)) {
        // curl reports a different URL than the one requested: record the mapping.
        insert_url($url,$next_url);
        print ("CRAWLED: $url -> $next_url\n");
    }
    return $next_url;
}
// Main loop: read one JSON document per line from stdin and, for every URL
// entity found in a tweet, store its expansion and resolve its redirects.
$handle = fopen("php://stdin", 'r');
if ($handle === false) {
    fwrite(STDERR, "Could not open stdin\n");
    exit(1);
}

// No length cap on fgets(): a line split in the middle would not decode as JSON.
while (($buffer = fgets($handle)) !== false) {
    $obj = json_decode($buffer);

    // Only messages carrying `created_at` are processed; undecodable lines and
    // other message types (e.g. those with a `delete` key) are skipped.
    if (!isset($obj->{'created_at'})) {
        continue;
    }
    // A message without a URL entity list has nothing to crawl.
    if (!isset($obj->{'entities'}->{'urls'}) || !is_array($obj->{'entities'}->{'urls'})) {
        continue;
    }

    foreach ($obj->{'entities'}->{'urls'} as $item) {
        $original_url = isset($item->{'url'}) ? $item->{'url'} : '';
        $expanded_url = isset($item->{'expanded_url'}) ? $item->{'expanded_url'} : '';
        print ("\nORIG: $original_url -> $expanded_url\n");
        if (strlen($expanded_url) > 0) {
            insert_url($original_url,$expanded_url);
            // Called for its side effects (DB rows and progress output).
            get_final_url($expanded_url);
        }
    }
}

fclose($handle);
mysql_close($link);
print ("E X I T\n");
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment