Skip to content

Instantly share code, notes, and snippets.

@vijinho
Last active September 29, 2018 11:57
Show Gist options
  • Save vijinho/67d1a07bce5bbf98d041ee07084d1a9d to your computer and use it in GitHub Desktop.
Save vijinho/67d1a07bce5bbf98d041ee07084d1a9d to your computer and use it in GitHub Desktop.
Unshorten a URL (find the target of a shortened URL) using PHP; returns the target URL on success, or a status code on failure.
#!/usr/bin/php
<?php
/**
* url_resolve.php - CLI script for resolving url
*
* @author Vijay Mahrra <[email protected]>
* @copyright (c) Copyright 2018 Vijay Mahrra
* @license GPLv3 (http://www.gnu.org/licenses/gpl-3.0.html)
*/
//-----------------------------------------------------------------------------
// required commands check
// url_resolve() shells out to both tools (curl first, wget as a fallback probe),
// so the script cannot do its job unless every one of them was located.
// get_commands() prints a "Missing requirement" line for each tool it could
// not find; here we only need to stop, with a non-zero status so that calling
// shells and scripts can detect the failure.
$requirements = [
    'curl' => 'https://curl.haxx.se',
    'wget' => 'https://www.gnu.org/software/wget/'
];
$commands = get_commands($requirements);
if (count($commands) < count($requirements)) {
    exit(1);
}
//-----------------------------------------------------------------------------
// define command-line options
// see https://secure.php.net/manual/en/function.getopt.php
// : - required, :: - optional
$options = getopt("hvtdou:",
    ['help', 'verbose', 'test', 'debug', 'offline', 'url:']);
// help: shown when no recognised option was given or when explicitly requested
if (empty($options) || array_key_exists('h', $options) || array_key_exists('help', $options)) {
    options:
    echo join("\n",
        [
            "Usage: php url_resolve.php",
            "Resolve a URL (e.g. unshorten it)",
            "(Specifying any other unknown argument options will be ignored.)\n",
            "\t-h, --help Display this help and exit",
            "\t-v, --verbose Run in verbose mode",
            "\t-d, --debug Run in debug mode",
            "\t-t, --test Run in test mode, show what would be done, no file changes",
            "\t-o, --offline Do not go online!",
            "\t-u, --url=http://... Resolve this ur.",
            "\nExamples:",
            "\n\tResolve URL in verbose and debug mode: php url_resolve.php -d -v -u http://www.urunu.com",
            "\n\tResolve URL in offline mode (always return URL!): php url_resolve.php -o -u http://www.urunu.com"
        ]) . "\n";
    // goto jump here if there's a problem
    // ($errors is only populated further down the script, so it is still
    // unset when the help text is reached directly; empty() copes with that)
    errors:
    if (!empty($errors)) {
        echo "Error(s):\n\t- " . join("\n\t- ", $errors) . "\n";
        // report the failure to the calling shell instead of exiting with 0
        exit(1);
    }
    exit;
}
// Each task flag can be switched on with either its short or its long option.
$flag_aliases = [
    'verbose' => ['v', 'verbose'],
    'test'    => ['t', 'test'],
    'debug'   => ['d', 'debug'],
    'offline' => ['o', 'offline']
];
$do = [];
foreach ($flag_aliases as $task => $aliases) {
    list($short, $long) = $aliases;
    // getopt() stores value-less options as keys, so presence is what counts
    $enabled = array_key_exists($short, $options) || array_key_exists($long, $options);
    $do[$task] = (int) $enabled;
}
//-----------------------------------------------------------------------------
// Expose the flags as constants; the (int) cast pins each one to 0 or 1.
define('DEBUG', (int) $do['debug']);
define('VERBOSE', (int) $do['verbose']);
define('TEST', (int) $do['test']);
define('OFFLINE', (int) $do['offline']);
debug("COMMANDS:", $commands);
verbose('Tasks to perform:', $do);
//-----------------------------------------------------------------------------
// url manipulation and handling variables
// Hostnames of link shorteners / redirectors (other than Twitter's own) whose
// URLs should be dereferenced. NOTE(review): nothing else in the visible
// script reads this list - confirm whether it is still needed.
$url_shorteners = [
    '53eig.ht', 'aca.st', 'amzn.to', 'b-o-e.uk', 'b0x.ee', 'bankofeng.uk', 'bbc.in', 'bit.ly',
    'bitly.com', 'bloom.bg', 'boe.uk', 'bru.gl', 'buff.ly', 'cnb.cx', 'cnnmon.ie', 'dailym.ai',
    'deck.ly', 'dld.bz', 'dlvr.it', 'econ.st', 'eff.org', 'eurone.ws', 'fal.cn', 'fb.me',
    'for.tn', 'go.nasa.gov', 'go.shr.lc', 'goo.gl', 'ht.ly', 'hubs.ly', 'huff.to', 'ind.pn',
    'instagr.am', 'interc.pt', 'j.mp', 'jrnl.ie', 'jtim.es', 'kurl.nl', 'ln.is', 'n.mynews.ly',
    'newsl.it', 'n.pr', 'nyp.st', 'nyti.ms', 'on.fb.me', 'on.ft.com', 'on.mktw.net', 'on.rt.com',
    'on.wsj.com', 'ow.ly', 'owl.li', 'po.st', 'poal.me', 'ptv.io', 'read.bi', 'reut.rs',
    'rviv.ly', 'sc.mp', 'scl.io', 'shr.gs', 'shar.es', 'socsi.in', 'spon.de', 'spoti.fi',
    'spr.ly', 'sptnkne.ws', 'str.sg', 't.co', 'tgam.ca', 'ti.me', 'tinurl.us', 'tinyurl.com',
    'tlsur.net', 'tmblr.co', 'tr.im', 'trib.al', 'tws.io', 'vrge.co', 'wapo.st', 'wef.ch',
    'wp.me', 'wpo.st', 'wrd.cm', 'wrld.bg', 'www.goo.gl', 'xhne.ws', 'yhoo.it', 'youtu.be'
];
//-----------------------------------------------------------------------------
// initialise variables
$errors = []; // errors to be output if a problem occurred
$output = []; // data to be output at the end
//-----------------------------------------------------------------------------
// MAIN resolve url
// the short option wins over the long one when both are supplied
$url = '';
if (!empty($options['u'])) {
    $url = $options['u'];
} elseif (!empty($options['url'])) {
    $url = $options['url'];
}
// getopt() returns an array when an option is repeated; use the last value
if (is_array($url)) {
    $url = end($url);
}
$url = is_string($url) ? $url : '';
// Only absolute http(s) URLs with a host can be resolved. URL schemes are
// case-insensitive, so normalise before comparing.
$parts = parse_url($url);
$scheme = (is_array($parts) && isset($parts['scheme'])) ? strtolower($parts['scheme']) : '';
$has_host = is_array($parts) && isset($parts['host']) && $parts['host'] !== '';
if (!$has_host || !in_array($scheme, ['http', 'https'], true)) {
    $url = '';
}
if (empty($url)) {
    $errors[] = "You must specify a valid URL!";
    goto errors;
}
verbose(sprintf("URL: %s", $url));
$output = url_resolve($url);
//-----------------------------------------------------------------------------
// final output of data
output:
// display any errors
if (!empty($errors)) {
    goto errors;
}
// $output is either the resolved URL or a numeric status code
print_r($output);
echo "\n";
exit;
//-----------------------------------------------------------------------------
// functions used above
/**
 * Print a debug line (and optionally a data dump) when the DEBUG constant
 * is truthy; otherwise do nothing.
 *
 * @param string $string message, printed with a "[D] " prefix
 * @param mixed  $data   extra value dumped with print_r() when not empty
 * @return boolean true if the message was printed, false when DEBUG is off
 */
function debug($string = '', $data = '')
{
    if (!DEBUG) {
        return false;
    }

    echo '[D] ' . $string . "\n";
    if (!empty($data)) {
        print_r($data);
    }

    return true;
}
/**
 * Print a message (and optionally a data dump) when the VERBOSE constant
 * is truthy and the message is not empty.
 *
 * @param string $string message, printed with a "[V] " prefix
 * @param mixed  $data   extra value dumped with print_r() when not empty
 * @return boolean true if the message was printed, false otherwise
 */
function verbose($string, $data = '')
{
    if (!VERBOSE || empty($string)) {
        return false;
    }

    echo '[V] ' . $string . "\n";
    if (!empty($data)) {
        print_r($data);
    }

    return true;
}
/**
 * Locate the required command-line tools and remember their paths.
 *
 * Each tool is looked up with `which`. A line is echoed for every tool that
 * cannot be found. Once at least one tool has been located the result is
 * kept in a static cache, and later calls (with or without arguments)
 * return that cache unchanged.
 *
 * @param array $requirements [command => description shown when it is missing]
 * @return array [command => path] for every tool that was found (may be empty)
 */
function get_commands($requirements = [])
{
    static $commands = []; // cli command paths
    if (!empty($commands)) {
        return $commands;
    }
    $errors = [];
    foreach ($requirements as $tool => $description) {
        try {
            $found = cmd_execute('which ' . escapeshellarg($tool));
        } catch (Exception $e) {
            // cmd_execute() throws whenever the command writes to stderr, and
            // some `which` implementations report a missing tool that way;
            // treat it as "not found" rather than letting the script die
            $found = [];
        }
        // with splitting enabled the first element is the first output line
        $path = (is_array($found) && isset($found[0])) ? trim($found[0]) : '';
        if ($path === '') {
            $errors[] = "Error: Missing requirement: $tool - " . $description;
        } else {
            $commands[$tool] = $path;
        }
    }
    if (!empty($errors)) {
        echo join("\n", $errors) . "\n";
    }
    return $commands;
}
/**
 * Execute a command and return what it wrote to stdout and stderr.
 *
 * Nothing is sent to the command: its stdin is closed straight away so that
 * programs which read stdin see end-of-file instead of blocking forever.
 * The 'stdin' key is kept in the result for backward compatibility and is
 * always an empty string.
 *
 * stdout is drained completely before stderr is read, so a command that
 * writes a very large amount to stderr before closing stdout could still
 * stall; the commands used by this script produce small outputs.
 *
 * @param string $cmd command to execute
 * @return mixed array ['stdin' => '', 'stdout' => string, 'stderr' => string]
 *               | boolean false if the process could not be started
 * @see https://secure.php.net/manual/en/function.proc-open.php
 */
function shell_execute($cmd)
{
    $pipes = [];
    $process = proc_open(
        $cmd,
        [
            0 => ['pipe', 'r'], // child reads its stdin from this pipe
            1 => ['pipe', 'w'], // child writes stdout here
            2 => ['pipe', 'w']  // child writes stderr here
        ],
        $pipes
    );
    if (!is_resource($process)) {
        return false;
    }

    // our end of pipe 0 is write-only: there is nothing to read from it
    fclose($pipes[0]);

    $stdout = stream_get_contents($pipes[1]);
    $stderr = stream_get_contents($pipes[2]);

    // close every pipe before proc_close(), as the PHP manual advises
    fclose($pipes[1]);
    fclose($pipes[2]);
    proc_close($process);

    return [
        'stdin'  => '',
        'stdout' => (false === $stdout) ? '' : $stdout,
        'stderr' => (false === $stderr) ? '' : $stderr
    ];
}
/**
 * Execute a command and return its stdout, or throw if anything went wrong.
 *
 * Any output on stderr is treated as a failure, even if the command also
 * produced stdout.
 *
 * @param string $cmd command to execute
 * @param boolean $split split the output? (default: yes, on newline)
 * @param string $exp regular expression handed to preg_split() when splitting
 * @return mixed array of pieces when splitting non-empty output, otherwise
 *               the raw stdout string (empty output is returned unsplit)
 * @throws Exception with the stderr text when the command wrote to stderr
 * @throws RuntimeException when the command could not be started at all
 * @see shell_execute($cmd)
 */
function cmd_execute($cmd, $split = true, $exp = "/\n/")
{
    $result = shell_execute($cmd);
    // shell_execute() returns false when the process could not be started;
    // indexing into that would only raise warnings and hide the real problem
    if (!is_array($result)) {
        throw new RuntimeException("Unable to execute command: $cmd");
    }
    if (!empty($result['stderr'])) {
        throw new Exception($result['stderr']);
    }
    $data = $result['stdout'];
    if (empty($split) || empty($exp) || empty($data)) {
        return $data;
    }
    // note: output ending in the delimiter yields a trailing empty element
    return preg_split($exp, $data);
}
/**
 * Unshorten a URL / find the final target of a URL.
 *
 * Steps:
 *  1. curl follows redirects with HEAD requests and reports the effective URL.
 *  2. If curl failed somewhere along the chain, resolution restarts from the
 *     last URL it reached.
 *  3. If curl ended on the URL it was given, `wget --spider` is used as a
 *     second opinion and the last different URL it mentions is followed.
 *  4. The final target is fetched once more; the exit status of that curl run
 *     decides whether the URL or the status code is returned.
 *
 * Results are cached per URL in a static variable for the life of the process.
 * A cached curl exit code from the retry list is discarded and tried again.
 * In OFFLINE mode the URL is returned untouched.
 *
 * NOTE: curl is run with `-k --ciphers ALL`, so TLS certificates are NOT
 * verified. That is inherited behaviour; do not trust the result for anything
 * security-sensitive.
 *
 * @param string $url the url to resolve
 * @param array $options optional settings:
 *                       'timeout'  => connect timeout in seconds (default 3)
 *                       'max_time' => total time limit in seconds (default timeout * 10)
 *                       'max_hops' => maximum number of restarts (default 10)
 * @return string|int the destination URL, OR a curl exit code, OR -22 when
 *                    wget reports a broken link
 * @see https://ec.haxx.se/usingcurl-returns.html
 */
function url_resolve($url, $options = [])
{
    if (OFFLINE) {
        return $url;
    }

    // use the paths found by get_commands(); fall back to a PATH lookup so
    // the function still works if a tool has not been resolved
    $commands = get_commands();
    $curl = escapeshellarg(isset($commands['curl']) ? $commands['curl'] : 'curl');
    $wget = escapeshellarg(isset($commands['wget']) ? $commands['wget'] : 'wget');

    // curl exit codes worth another attempt when found in the cache
    // https://ec.haxx.se/usingcurl-returns.html
    // Any other cached value (a URL, or a "dead" code such as
    // 3, 6, 7, 18, 28, 35, 47, 52, 56) is returned as-is.
    $retry_exit_codes = [
        4, 5, 16, 23, 26, 27, 33, 42, 43,
        45, 48, 55, 59, 60, 61, 75, 76, 77, 78, 80
    ];

    $connect_timeout = !empty($options['timeout']) ? (int) $options['timeout'] : 3;
    $max_time = !empty($options['max_time']) ? (int) $options['max_time'] : $connect_timeout * 10;
    $max_hops = !empty($options['max_hops']) ? (int) $options['max_hops'] : 10;
    $curl_options = "--connect-timeout $connect_timeout --max-time $max_time --ciphers ALL -k";

    static $urls = []; // remember previous urls
    $found_urls = [];  // URLs seen in wget output during this call
    $hops = 0;         // restarts taken so far; bounded by $max_hops

    url_resolve_recheck: // re-check from here
    if (array_key_exists($url, $urls)) {
        if (!in_array($urls[$url], $retry_exit_codes, true)) {
            return $urls[$url];
        }
        // transient failure last time: forget it and try again
        unset($urls[$url]);
    }

    // HEAD request following redirects; prints only the effective URL
    $output = [];
    $status = null;
    $target_url = exec(
        $curl . ' ' . $curl_options . ' -I -i -Ls -w %{url_effective} -o /dev/null ' . escapeshellarg($url),
        $output,
        $status
    );

    // curl failed after being redirected: restart from where it got to.
    // An empty effective URL (e.g. curl could not run) is never followed,
    // so the URL being resolved is not thrown away.
    if ($status !== 0
        && is_string($target_url) && $target_url !== ''
        && !is_numeric($target_url) && $target_url !== $url
        && $hops < $max_hops
    ) {
        $hops++;
        $url = $target_url;
        goto url_resolve_recheck;
    }

    // curl ended where it started: ask wget for a second opinion
    if ($target_url == $url) {
        $spider = shell_execute($wget . ' -t 2 -T 5 -v --spider ' . escapeshellarg($url));
        $stderr_only = !empty($spider) && is_array($spider)
            && empty($spider['stdin']) && empty($spider['stdout']) && !empty($spider['stderr']);
        if ($stderr_only) {
            $stderr = $spider['stderr'];
            if (false !== stristr($stderr, 'broken link')) {
                return -22;
            }
            if (false !== stristr($stderr, 'Remote file exists and could contain further links,')) {
                return $target_url;
            }
            $matched = preg_match_all(
                '/(?P<url>http[s]?:\/\/[^\s]+[^\.\s]+)/i',
                $stderr,
                $matches
            );
            if ($matched && !empty($matches['url'])) {
                foreach ($matches['url'] as $found) {
                    $found_urls[$found] = $found;
                }
                // follow only when the last URL wget mentioned differs from
                // the one just tried; otherwise there is nowhere new to go
                $last_match = end($matches['url']);
                if ($target_url !== $last_match && $hops < $max_hops) {
                    $hops++;
                    $target_url = array_pop($found_urls);
                    $url = $target_url;
                    goto url_resolve_recheck;
                }
            }
        }
    }

    // Confirm the target with a real request. Runs after a successful
    // resolution, or after "could not resolve host" (6) when a non-numeric,
    // non-empty target is known. Only the exit status of this run is used.
    if ($status === 0 || ($status === 6 && !is_numeric($target_url) && !empty($target_url))) {
        $output = [];
        exec(
            $curl . ' ' . $curl_options . ' -s -o /dev/null -w %{http_code} ' . escapeshellarg($target_url),
            $output,
            $status
        );
    }

    $return = ($status === 0) ? $target_url : $status;
    $urls[$url] = $return; // cache in static var
    return $return;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment