Last active
September 29, 2018 11:57
-
-
Save vijinho/67d1a07bce5bbf98d041ee07084d1a9d to your computer and use it in GitHub Desktop.
Unshorten a URL using PHP: find the target of a shortened URL and return it on success, or return a status code on failure.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/php | |
<?php | |
/**
 * url_resolve.php - CLI script for resolving a URL
 *
 * @author Vijay Mahrra <[email protected]>
 * @copyright (c) Copyright 2018 Vijay Mahrra
 * @license GPLv3 (http://www.gnu.org/licenses/gpl-3.0.html)
 */
//-----------------------------------------------------------------------------
// required commands check
$required_commands = [
    'curl' => 'https://curl.haxx.se',
    'wget' => 'https://www.gnu.org/software/wget/'
];
$commands = get_commands($required_commands);
// get_commands() prints one message per missing tool. url_resolve() reads both
// the 'curl' and the 'wget' entry, so stop unless every required tool was found
// (not only when none of them was found).
if (count(array_diff_key($required_commands, $commands)) > 0) {
    exit(1);
}

//-----------------------------------------------------------------------------
// define command-line options
// see https://secure.php.net/manual/en/function.getopt.php
// : - required, :: - optional
$options = getopt(
    "hvtdou:",
    ['help', 'verbose', 'test', 'debug', 'offline', 'url:']
);

// help: shown when no option could be parsed or when explicitly requested
if (empty($options) || array_key_exists('h', $options) || array_key_exists('help', $options)) {
    echo join("\n", [
        "Usage: php url_resolve.php",
        "Resolve a URL (e.g. unshorten it)",
        "(Specifying any other unknown argument options will be ignored.)\n",
        "\t-h, --help Display this help and exit",
        "\t-v, --verbose Run in verbose mode",
        "\t-d, --debug Run in debug mode",
        "\t-t, --test Run in test mode, show what would be done, no file changes",
        "\t-o, --offline Do not go online!",
        "\t-u, --url=http://... Resolve this URL.",
        "\nExamples:",
        "\n\tResolve URL in verbose and debug mode: php url_resolve.php -d -v -u http://www.urunu.com",
        "\n\tResolve URL in offline mode (always return URL!): php url_resolve.php -o -u http://www.urunu.com"
    ]) . "\n";
    exit;
}

// translate the short/long flag pairs into 0/1 switches
$do = [];
foreach ([
    'verbose' => ['v', 'verbose'],
    'test'    => ['t', 'test'],
    'debug'   => ['d', 'debug'],
    'offline' => ['o', 'offline']
] as $i => $opts) {
    $do[$i] = (int) (array_key_exists($opts[0], $options) || array_key_exists($opts[1], $options));
}

//-----------------------------------------------------------------------------
// defines (int) - forces 0 or 1 value
define('DEBUG', (int) $do['debug']);
define('VERBOSE', (int) $do['verbose']);
define('TEST', (int) $do['test']);
define('OFFLINE', (int) $do['offline']);

debug("COMMANDS:", $commands);
verbose('Tasks to perform:', $do);

//-----------------------------------------------------------------------------
// url manipulation and handling variables
// Hosts of known URL shorteners: dereference & update URLs if moved or using a
// shortener which is not twitter's.
// NOTE(review): nothing in the visible script reads this list - confirm whether
// it is still needed.
$url_shorteners = [
    '53eig.ht', 'aca.st', 'amzn.to', 'b-o-e.uk', 'b0x.ee', 'bankofeng.uk',
    'bbc.in', 'bit.ly',
    'bitly.com', 'bloom.bg', 'boe.uk', 'bru.gl', 'buff.ly', 'cnb.cx', 'cnnmon.ie', 'dailym.ai',
    'deck.ly', 'dld.bz',
    'dlvr.it', 'econ.st', 'eff.org', 'eurone.ws', 'fal.cn', 'fb.me', 'for.tn', 'go.nasa.gov',
    'go.shr.lc',
    'goo.gl', 'ht.ly', 'hubs.ly', 'huff.to', 'ind.pn', 'instagr.am',
    'interc.pt',
    'j.mp', 'jrnl.ie', 'jtim.es', 'kurl.nl', 'ln.is',
    'n.mynews.ly', 'newsl.it', 'n.pr',
    'nyp.st', 'nyti.ms', 'on.fb.me', 'on.ft.com', 'on.mktw.net', 'on.rt.com', 'on.wsj.com',
    'ow.ly', 'owl.li',
    'po.st', 'poal.me', 'ptv.io', 'read.bi', 'reut.rs', 'rviv.ly', 'sc.mp', 'scl.io', 'shr.gs', 'shar.es',
    'socsi.in', 'spon.de',
    'spoti.fi', 'spr.ly', 'sptnkne.ws', 'str.sg', 't.co', 'tgam.ca', 'ti.me', 'tinurl.us',
    'tinyurl.com',
    'tlsur.net', 'tmblr.co', 'tr.im', 'trib.al', 'tws.io', 'vrge.co', 'wapo.st',
    'wef.ch', 'wp.me',
    'wpo.st', 'wrd.cm', 'wrld.bg', 'www.goo.gl', 'xhne.ws', 'yhoo.it', 'youtu.be'
];

//-----------------------------------------------------------------------------
// initialise variables
$errors = []; // errors to be output if a problem occurred
$output = []; // data to be output at the end

//-----------------------------------------------------------------------------
// MAIN resolve url
$url = '';
foreach (['u', 'url'] as $key) {
    if (!empty($options[$key])) {
        // getopt() yields an array when an option is repeated; use the last value
        $value = $options[$key];
        $url = is_array($value) ? (string) end($value) : (string) $value;
        break;
    }
}

// only absolute http(s) URLs that name a host are accepted
$parts  = parse_url($url);
$scheme = (is_array($parts) && isset($parts['scheme'])) ? strtolower($parts['scheme']) : '';
if (!is_array($parts) || empty($parts['host']) || !in_array($scheme, ['http', 'https'], true)) {
    $url = '';
}

if (empty($url)) {
    $errors[] = "You must specify a valid URL!";
}

// report problems and signal failure to the calling shell
if (!empty($errors)) {
    echo "Error(s):\n\t- " . join("\n\t- ", $errors) . "\n";
    exit(1);
}

verbose(sprintf("URL: %s", $url));
$output = url_resolve($url);

//-----------------------------------------------------------------------------
// final output of data
print_r($output);
echo "\n";
exit;
//----------------------------------------------------------------------------- | |
// functions used above | |
/**
 * Print a debug line, and optionally a dump of extra data, when the DEBUG
 * constant is truthy.
 *
 * @param string $string message printed after the "[D] " prefix
 * @param mixed  $data   optional value dumped with print_r() when not empty
 * @return boolean true if something was printed, false if DEBUG is off
 */
function debug($string = '', $data = '')
{
    // guard clause: stay silent unless debugging is switched on
    if (!DEBUG) {
        return false;
    }

    print "[D] " . $string . "\n";

    if ($data) {
        print_r($data);
    }

    return true;
}
/**
 * Print a message, and optionally a dump of extra data, when the VERBOSE
 * constant is truthy and the message is not empty.
 *
 * @param string $string message printed after the "[V] " prefix
 * @param mixed  $data   optional value dumped with print_r() when not empty
 * @return boolean true if something was printed, false otherwise
 */
function verbose($string, $data = '')
{
    // nothing to do when verbose mode is off or there is no message
    if (!VERBOSE || empty($string)) {
        return false;
    }

    print "[V] " . $string . "\n";

    if ($data) {
        print_r($data);
    }

    return true;
}
/**
 * Check that the required command-line tools are installed and look up
 * their paths with `which`.
 *
 * The result is remembered in a static variable: once at least one tool has
 * been found, later calls return the remembered list without looking again,
 * so the function can be called without arguments to fetch the paths.
 *
 * A message is printed for every tool that cannot be found.
 *
 * @param array $requirements [command => description/URL shown if missing]
 * @return array [command => path] for the tools that were found
 */
function get_commands($requirements = [])
{
    static $commands = []; // cli command paths
    if (!empty($commands)) {
        return $commands;
    }

    $errors = [];
    foreach ($requirements as $tool => $description) {
        try {
            $cmd = cmd_execute('which ' . escapeshellarg($tool));
        } catch (Exception $e) {
            // cmd_execute() throws when the command writes to stderr, which
            // some `which` implementations do for an unknown command; treat
            // that as "not installed" instead of aborting the script
            $cmd = [];
        }

        // the first line of output is the path of the tool
        $path = (is_array($cmd) && isset($cmd[0])) ? trim($cmd[0]) : '';
        if ($path === '') {
            $errors[] = "Error: Missing requirement: $tool - " . $description;
        } else {
            $commands[$tool] = $path;
        }
    }

    if (!empty($errors)) {
        echo join("\n", $errors) . "\n";
    }

    return $commands;
}
/**
 * Execute a command through proc_open() and return what it wrote.
 *
 * Nothing is sent to the command: its stdin is closed straight away so that
 * a command which reads its input sees end-of-file instead of waiting
 * forever. The 'stdin' entry of the result is kept for backward
 * compatibility and is always an empty string.
 *
 * stdout is read to the end before stderr, so this is meant for commands
 * with modest amounts of error output.
 *
 * @param string $cmd command to execute
 * @return mixed array ['stdin' => '', 'stdout' => string, 'stderr' => string]
 *               | boolean false if the process could not be started
 * @see https://secure.php.net/manual/en/function.proc-open.php
 */
function shell_execute($cmd)
{
    $pipes = [];
    $process = proc_open(
        $cmd,
        [
            0 => ['pipe', 'r'], // child reads its stdin from us
            1 => ['pipe', 'w'], // child writes its stdout to us
            2 => ['pipe', 'w']  // child writes its stderr to us
        ],
        $pipes
    );
    if (!is_resource($process)) {
        return false;
    }

    // our end of the stdin pipe is write-only: there is nothing to read from
    // it, and it has to be closed for the child to receive end-of-file
    fclose($pipes[0]);

    $stdout = stream_get_contents($pipes[1]);
    fclose($pipes[1]);

    $stderr = stream_get_contents($pipes[2]);
    fclose($pipes[2]);

    // all pipes are closed at this point, as proc_close() requires in order
    // to avoid a deadlock
    proc_close($process);

    return [
        'stdin'  => '',
        'stdout' => (false === $stdout) ? '' : $stdout,
        'stderr' => (false === $stderr) ? '' : $stderr
    ];
}
/**
 * Execute a command and return what it wrote to stdout.
 *
 * @param string  $cmd   command to execute
 * @param boolean $split split the output? by default on newlines
 * @param string  $exp   regular expression handed to preg_split()
 * @return mixed the list of pieces when splitting is enabled and there is
 *               output, otherwise the raw stdout string
 * @throws Exception when the command cannot be started, or when it writes
 *                   anything to stderr (the message is the stderr text)
 * @see shell_execute($cmd)
 */
function cmd_execute($cmd, $split = true, $exp = "/\n/")
{
    $result = shell_execute($cmd);

    // shell_execute() returns false when the process could not be started
    if (!is_array($result)) {
        throw new Exception("Unable to execute command: $cmd");
    }

    if (!empty($result['stderr'])) {
        throw new Exception($result['stderr']);
    }

    $data = $result['stdout'];
    if (empty($split) || empty($exp) || empty($data)) {
        return $data;
    }

    return preg_split($exp, $data);
}
/**
 * Unshorten a URL, i.e. find the final target of a URL.
 *
 * In offline mode (OFFLINE constant truthy) the URL is returned unchanged.
 * Otherwise:
 *  1. curl follows the redirects with HEAD requests and reports the
 *     effective URL;
 *  2. if curl ends up on the URL it was given, `wget --spider` is asked as
 *     well: a "broken link" yields -22, a confirmed remote file yields the
 *     URL, and any other URL seen in wget's output is followed;
 *  3. finally curl fetches the target once more and the exit status of that
 *     call decides between returning the target URL and a status code.
 *
 * Results are remembered in a static cache for the lifetime of the process,
 * under both the URL that was asked for and the URL that was finally probed.
 * A cached curl exit code from the retry list is not trusted and is looked
 * up again.
 *
 * NOTE(review): curl runs with "--ciphers ALL -k", i.e. without certificate
 * verification - kept from the original, confirm this is intended.
 *
 * @param string $url     the url to resolve
 * @param array  $options optional settings:
 *                        'timeout'  => curl connect timeout in seconds (3),
 *                        'max_time' => curl total time limit (timeout * 10),
 *                        'max_hops' => how many times the function may
 *                                      restart with a newly found URL (10)
 * @return string|int the destination URL, OR a curl exit status,
 *                    OR -22 when wget reports a broken link
 * @see https://ec.haxx.se/usingcurl-returns.html
 */
function url_resolve($url, $options = [])
{
    if (OFFLINE) {
        return $url;
    }

    $commands = get_commands();
    $wget = $commands['wget'];
    $curl = $commands['curl'];

    // retry getting a url if the cached curl exit code is in this list
    // https://ec.haxx.se/usingcurl-returns.html
    $retry_exit_codes = [
        4, 5, 16, 23, 26, 27, 33, 42, 43,
        45, 48, 55, 59, 60, 61, 75, 76, 77, 78, 80
    ];
    // return codes which indicate we should not try to resolve a url again
    // -22 signifies a wget failure, the rest are from curl
    // (6 - couldn't resolve host)
    $dead_exit_codes = [3, 6, 7, 18, 28, 35, 47, 52, 56, -22];

    static $urls = []; // remember previous urls

    $timeout = !empty($options['timeout']) ? (int) $options['timeout'] : 3;
    $max_time = !empty($options['max_time']) ? (int) $options['max_time'] : $timeout * 10;
    $max_hops = !empty($options['max_hops']) ? (int) $options['max_hops'] : 10;

    $user_agent = ''; //'-A "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36" ';
    $curl_options = "$user_agent --connect-timeout $timeout --max-time $max_time --ciphers ALL -k";

    $requested_url = $url; // the URL the caller asked about
    $hops = 0;             // restarts so far, bounded by $max_hops

    url_resolve_recheck: // re-check from here
    if (array_key_exists($url, $urls)) {
        $cached = $urls[$url];
        $retry = is_int($cached)
            && in_array($cached, $retry_exit_codes, true)
            && !in_array($cached, $dead_exit_codes, true);
        if (!$retry) {
            $urls[$requested_url] = $cached;
            return $cached;
        }
        // transient failure last time: forget it and try again
        unset($urls[$url]);
    }

    // HEAD requests following redirects; the only output is the effective URL
    $output = [];
    $target_url = exec(
        "$curl $curl_options -I -i -Ls -w %{url_effective} -o /dev/null " . escapeshellarg($url),
        $output,
        $status
    );
    if (!is_string($target_url)) {
        $target_url = ''; // exec() itself failed
    }

    // curl failed part-way through a redirect chain: carry on from where it stopped
    if ($status !== 0
        && $target_url !== ''
        && !is_numeric($target_url)
        && $target_url !== $url
        && $hops < $max_hops
    ) {
        $hops++;
        $url = $target_url;
        goto url_resolve_recheck;
    }

    // curl ended on the URL it was given: try wget instead
    if ($target_url === $url) {
        $spider = shell_execute(
            sprintf("$wget -t 2 -T 5 -v --spider %s", escapeshellarg($url))
        );

        // the wget messages inspected below are expected on stderr only
        if (is_array($spider)
            && empty($spider['stdin'])
            && empty($spider['stdout'])
            && !empty($spider['stderr'])
        ) {
            $report = $spider['stderr'];

            if (false !== stristr($report, 'broken link')) {
                return -22;
            }

            if (false !== stristr($report, 'Remote file exists and could contain further links,')) {
                return $target_url;
            }

            if (preg_match_all('/(?P<url>http[s]?:\/\/[^\s]+[^\.\s]+)/i', $report, $matches)
                && !empty($matches['url'])
            ) {
                // follow the last distinct URL wget mentioned, unless it is
                // the one that was just tried
                $found_urls = array_unique($matches['url']);
                $candidate = end($found_urls);
                if ($candidate !== $target_url && $hops < $max_hops) {
                    $hops++;
                    $url = $candidate;
                    goto url_resolve_recheck;
                }
            }
        }
    }

    // fetch the target itself; only the exit status of this call is used
    if ($target_url !== '' && ($status === 0 || ($status === 6 && !is_numeric($target_url)))) {
        $output = [];
        exec(
            "$curl $curl_options -s -o /dev/null -w %{http_code} " . escapeshellarg($target_url),
            $output,
            $status
        );
    }

    $return = ($status === 0) ? $target_url : $status;

    // cache in static var, for the probed URL and for the one asked about
    $urls[$url] = $return;
    $urls[$requested_url] = $return;

    return $return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment