Created
March 13, 2012 23:35
-
-
Save ninnypants/2032680 to your computer and use it in GitHub Desktop.
Proxy that adjusts links
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// The session stores the cookies collected from proxied sites
// ($_SESSION['cookies']) so they can be replayed on later requests.
session_start();
/**
 * Assemble a URL string from parse_url()-style components.
 *
 * Missing components fall back to defaults (scheme "http", everything else
 * empty). Components are expected without their delimiters, the way
 * parse_url() returns them: the port without ":", the query without "?" and
 * the fragment without "#". A leading ":" on the port or "#" on the fragment
 * is tolerated so callers that already added it keep working.
 *
 * @param array $args Any of: scheme, host, path, query, user, pass, port,
 *                    fragment. "query" may be a string or an array of
 *                    parameters (encoded with http_build_query()).
 * @return string The assembled URL.
 */
function build_url($args = array()){
    $defaults = array(
        'scheme' => 'http',
        'host' => '',
        'path' => '',
        'query' => '',
        'user' => '',
        'pass' => '',
        'port' => '',
        'fragment' => ''
    );
    $args = array_merge($defaults, $args);

    $url = $args['scheme'].'://';

    // optional user[:pass]@ prefix
    if((string) $args['user'] !== ''){
        $url .= $args['user'];
        if((string) $args['pass'] !== ''){
            $url .= ':'.$args['pass'];
        }
        $url .= '@';
    }

    $url .= $args['host'];

    // parse_url() returns the port as a bare integer, so the ":" is added here
    $port = ltrim((string) $args['port'], ':');
    if($port !== ''){
        $url .= ':'.$port;
    }

    $url .= $args['path'];

    // accept an array of parameters as well as a ready-made query string
    $query = $args['query'];
    if(is_array($query)){
        $query = http_build_query($query, '', '&');
    }
    if((string) $query !== ''){
        $url .= '?'.$query;
    }

    // parse_url() returns the fragment without its "#"
    $fragment = ltrim((string) $args['fragment'], '#');
    if($fragment !== ''){
        $url .= '#'.$fragment;
    }

    return $url;
}
/**
 * Turn the relative URLs captured by $match into absolute URLs on the
 * proxied site, so assets load directly from their origin.
 *
 * Reads the globals $cur_dir (URL of the current directory), $base
 * (scheme://host) and $cur (current page URL without query string).
 *
 * Only the captured URL itself is rewritten, at its exact position; other
 * text in the tag that happens to equal the URL is left alone. URLs that
 * already carry a scheme (http:, https:, data:, ...), fragment-only URLs and
 * empty values are returned unchanged.
 *
 * @param string $match   PCRE pattern whose second capture group is the URL.
 * @param string $content Markup to process.
 * @return string The markup with the matched URLs made absolute.
 */
function single_asset_replace($match, $content){
    global $cur_dir, $base, $cur;

    // collect every hit together with its byte offset in $content
    if(!preg_match_all($match, $content, $hits, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)){
        return $content;
    }

    // $cur_dir may or may not end in a slash; normalise it once
    $dir = rtrim((string) $cur_dir, '/').'/';
    $scheme = parse_url((string) $base, PHP_URL_SCHEME);
    if(!$scheme){
        $scheme = 'http';
    }

    // walk backwards so earlier offsets stay valid while the string changes
    for($i = count($hits) - 1; $i >= 0; $i--){
        if(!isset($hits[$i][2]) || !is_array($hits[$i][2]) || $hits[$i][2][1] < 0){
            continue;
        }
        $raw = $hits[$i][2][0];
        $offset = $hits[$i][2][1];
        $na = trim($raw);
        if($na === ''){
            continue;
        }

        if(preg_match('#^[a-z][a-z0-9+.\-]*:#i', $na)){
            // already absolute (http:, https:, data:, ...): nothing to resolve
        }elseif(strpos($na, '//') === 0){
            // protocol-relative: borrow the scheme of the proxied site
            $na = $scheme.':'.$na;
        }elseif($na[0] === '/'){
            $na = $base.$na;
        }elseif($na[0] === '?'){
            $na = $cur.$na;
        }elseif($na[0] === '#'){
            // fragment only: leave as is
        }else{
            // path relative to the current directory; "../" segments are
            // kept and left for the client to normalise
            $na = $dir.preg_replace('#^(?:\./)+#', '', $na);
        }

        if($na !== $raw){
            $content = substr_replace($content, $na, $offset, strlen($raw));
        }
    }

    return $content;
}
/**
 * Point every <a href="..."> in $content back at this proxy.
 *
 * Each href is resolved to an absolute URL using the globals $cur_dir (URL
 * of the current directory), $base (scheme://host) and $cur (current page
 * URL without query string), then replaced by "?url=<encoded target>".
 * The leading "http" is dropped from the target before encoding (so
 * "http://x" travels as "://x" and "https://x" as "s://x"); the request
 * handling code puts it back.
 *
 * Left untouched: empty or whitespace-only hrefs, fragment-only hrefs
 * ("#top") and non-HTTP schemes such as mailto: or javascript:.
 *
 * @param string $content Markup to process.
 * @return string The markup with its links routed through the proxy.
 */
function link_href_replace($content){
    global $cur_dir, $base, $cur;

    // the href must belong to the <a ...> tag itself ([^>]), and must not be
    // the tail of another attribute name such as data-href
    $pattern = '#<a\s[^>]*?(?<![\w\-])href\s*=\s*("|\')([^"\']*)("|\')#i';
    if(!preg_match_all($pattern, $content, $hits, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)){
        return $content;
    }

    // $cur_dir may or may not end in a slash; normalise it once
    $dir = rtrim((string) $cur_dir, '/').'/';
    $scheme = parse_url((string) $base, PHP_URL_SCHEME);
    if(!$scheme){
        $scheme = 'http';
    }

    // walk backwards so earlier offsets stay valid while the string changes
    for($i = count($hits) - 1; $i >= 0; $i--){
        $raw = $hits[$i][2][0];
        $offset = $hits[$i][2][1];
        $nh = trim($raw);

        // nothing to follow, or an in-page anchor
        if($nh === '' || $nh[0] === '#'){
            continue;
        }

        if(preg_match('#^[a-z][a-z0-9+.\-]*:#i', $nh)){
            // absolute: only http(s) targets can go through the proxy
            if(!preg_match('#^https?://#i', $nh)){
                continue;
            }
        }elseif(strpos($nh, '//') === 0){
            // protocol-relative: borrow the scheme of the proxied site
            $nh = $scheme.':'.$nh;
        }elseif($nh[0] === '/'){
            $nh = $base.$nh;
        }elseif($nh[0] === '?'){
            $nh = $cur.$nh;
        }else{
            // path relative to the current directory
            $nh = $dir.preg_replace('#^(?:\./)+#', '', $nh);
        }

        // replace http to work with tagperfect servers
        if(preg_match('#^http(s?)://#i', $nh, $sm)){
            $nh = strtolower($sm[1]).'://'.substr($nh, strlen($sm[0]));
        }

        $content = substr_replace($content, '?url='.urlencode($nh), $offset, strlen($raw));
    }

    return $content;
}
/**
 * Collect the stored cookies that apply to the host being proxied.
 *
 * The host is taken from the global $urlp['host'] and the cookies from
 * $_SESSION['cookies'] (strings carrying a "domain=" attribute). A cookie
 * is selected when its domain equals the host or one of its parent domains
 * of at least two labels, e.g. for "www.example.com": "example.com" and
 * "www.example.com". A single-label host such as "localhost" is matched
 * as is.
 *
 * NOTE(review): the returned strings are the stored cookie lines including
 * their attributes, not bare name=value pairs.
 *
 * @return array List of matching cookie strings, without duplicates.
 */
function get_site_cookies(){
    global $urlp;
    $ret = array();

    if(empty($urlp['host']) || empty($_SESSION['cookies']) || !is_array($_SESSION['cookies'])){
        return $ret;
    }

    // build the list of candidate domains, shortest first
    $labels = explode('.', strtolower($urlp['host']));
    // get tld before you start looping
    $host = array_pop($labels);
    $candidates = count($labels) ? array() : array($host);
    while(count($labels)){
        $host = array_pop($labels).'.'.$host;
        $candidates[] = $host;
    }

    foreach($candidates as $candidate){
        // quote the host so "." is literal, and require the domain value to
        // end right after it so "example.com" does not match "example.community"
        $pattern = '#(?:^|[;\s])domain=\.?'.preg_quote($candidate, '#').'(?=[;\s]|$)#i';
        foreach($_SESSION['cookies'] as $cookie){
            if(!is_string($cookie)){
                continue;
            }
            if(preg_match($pattern, $cookie) && !in_array($cookie, $ret, true)){
                $ret[] = $cookie;
            }
        }
    }

    return $ret;
}
/**
 * Append a domain to cookies that come through without "domain" set.
 *
 * The domain is derived from the global $urlp['host']: its last two labels
 * (e.g. ".example.com" for "www.example.com"), or the host itself when it
 * has a single label. Surrounding whitespace (such as the "\r" left over
 * from a header line) and a trailing ";" are removed first.
 *
 * @param string $cookie A Set-Cookie header value.
 * @return string The cookie, guaranteed to carry a domain attribute when a
 *                host is known.
 */
function cookie_domain($cookie){
    global $urlp;

    $cookie = rtrim(trim($cookie), ';');

    // already has a domain attribute: keep it as sent
    if(preg_match('#(?:^|;)\s*domain\s*=#i', $cookie)){
        return $cookie;
    }

    $host_pieces = explode('.', isset($urlp['host']) ? $urlp['host'] : '');
    $domain = array_pop($host_pieces);
    if(count($host_pieces)){
        $domain = array_pop($host_pieces).'.'.$domain;
    }
    if($domain === ''){
        return $cookie;
    }

    return $cookie.'; domain=.'.$domain;
}
// No usable target given (missing, empty, or not a plain string such as
// url[]=x): show the entry form and stop.
if(empty($_GET['url']) || !is_string($_GET['url'])){
?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<form method="get" action="">
<input type="text" name="url" />
<input type="submit" value="Go" />
</form>
</body>
</html>
<?php
    exit;
}
// location used for the cookie files
$loc = dirname(__FILE__);

// make sure htmlentities didn't slip into the url
$url = str_replace('&amp;', '&', trim($_GET['url']));

// Make sure the url starts with http:// or https://. Rewritten links arrive
// with the leading "http" removed ("://host" or "s://host"), while a url
// typed into the form may be complete or have no scheme at all.
if(preg_match('#^https?://#i', $url)){
    // already complete
}elseif(preg_match('#^s?://#i', $url)){
    $url = 'http'.$url;
}else{
    $url = 'http://'.$url;
}

// parse the url so it can be rebuilt for certain relative link cases
$urlp = parse_url($url);
if($urlp === false || empty($urlp['host'])){
    header('HTTP/1.1 400 Bad Request');
    exit('Invalid URL');
}
$urlp['scheme'] = strtolower($urlp['scheme']);
// "http://host" has no path component; treat it as the site root
if(!isset($urlp['path']) || $urlp['path'] === ''){
    $urlp['path'] = '/';
}

// Keep a non-default port in the rebuilt urls. It is folded into the host
// value so the ":" is always present; $urlp['host'] itself stays a bare
// host name because the cookie functions split it on ".".
$authority = $urlp['host'];
if(isset($urlp['port'])){
    $authority .= ':'.$urlp['port'];
}

// build a url for the current url path no query string
$cur = build_url(array(
    'scheme' => $urlp['scheme'],
    'host' => $authority,
    'path' => $urlp['path']
));

// fix dir issues when referencing the current directory
// dirname will remove the last directory name from the path
// if the path does not have a file name at the end
if(substr($urlp['path'], -1) === '/'){
    $cdir_path = $urlp['path'];
}else{
    $cdir_path = dirname($urlp['path']);
}
// fix windows directory separator
$cdir_path = str_replace('\\', '/', $cdir_path);
$cur_dir = build_url(array(
    'scheme' => $urlp['scheme'],
    'host' => $authority,
    'path' => $cdir_path
));

// base site url for use with links that start with /
$base = build_url(array(
    'scheme' => $urlp['scheme'],
    'host' => $authority
));

// store method type if it's a form submission
$method = (isset($_GET['method']) && is_string($_GET['method'])) ? strtolower($_GET['method']) : '';

// if it's a get request build the query string out of everything that
// is sent through $url and $_POST
if($method == 'get'){
    $query_vars = array();
    if(isset($urlp['query'])){
        // parse_str() fills its second argument; it has no return value
        parse_str($urlp['query'], $query_vars);
    }
    $query_vars = array_merge($query_vars, $_POST);
    $urlp['query'] = http_build_query($query_vars, '', '&');

    // rebuild from a copy: port folded into the host, fragment dropped
    // because it is never sent to the server
    $rebuilt = $urlp;
    $rebuilt['host'] = $authority;
    unset($rebuilt['port'], $rebuilt['fragment']);
    $url = build_url($rebuilt);
}
// fetch the remote page, response headers included
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
// the url comes from the visitor: only ever speak http(s), also on redirects
if(defined('CURLOPT_PROTOCOLS')){
    curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
}
// The stored cookies are whole Set-Cookie values ("name=value; path=/; ...").
// Only the leading name=value pair belongs in the Cookie request header.
$cookie_pairs = array();
foreach(get_site_cookies() as $stored_cookie){
    $cookie_parts = explode(';', $stored_cookie, 2);
    $cookie_pair = trim($cookie_parts[0]);
    if($cookie_pair !== ''){
        $cookie_pairs[] = $cookie_pair;
    }
}
curl_setopt($ch, CURLOPT_COOKIE, implode('; ', $cookie_pairs));
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, array('Expect:'));
curl_setopt($ch, CURLINFO_HEADER_OUT, true);

// if the form submission method was post send the $_POST variable along
if($method == 'post'){
    curl_setopt($ch, CURLOPT_POST, true);
    // Send a urlencoded string rather than the raw array: an array forces
    // multipart/form-data, cannot carry nested fields, and on old PHP
    // versions a value starting with "@" is treated as a local file upload.
    curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($_POST, '', '&'));
}

// turn off peer verification for https to avoid verification issues
// NOTE(review): this disables certificate checking for every https target.
if($urlp['scheme']=='https'){
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
}

$content = curl_exec($ch);
if($content === false){
    // the transfer failed: report it instead of processing "false" as a page
    $fetch_error = curl_error($ch);
    curl_close($ch);
    header('HTTP/1.1 502 Bad Gateway');
    exit('Unable to fetch '.htmlspecialchars($url).': '.htmlspecialchars($fetch_error));
}
curl_close($ch);
// process the content
$content = (string) $content;

// Split the response header block(s) from the body first. There is one
// block per response, so redirects produce several in a row.
$response_headers = '';
if(preg_match('#^(?:HTTP.*?(?:\r?\n){2})+#is', $content, $header_match)){
    $response_headers = $header_match[0];
    // remove headers from the content
    $content = (string) substr($content, strlen($response_headers));
}

// extract cookies from the headers only, so text in the page body that
// happens to read "Set-Cookie: ..." is never stored
preg_match_all('#^Set-Cookie:[ \t]*(.*)$#im', $response_headers, $cookiematch);

// store cookies
if(!isset($_SESSION['cookies']) || !is_array($_SESSION['cookies'])){
    $_SESSION['cookies'] = array();
}
foreach($cookiematch[1] as $cookie){
    $cookie = cookie_domain(trim($cookie));
    if($cookie === ''){
        continue;
    }
    // key by name and domain so a cookie sent again replaces its earlier
    // value instead of piling up on every request
    $cookie_name_parts = explode('=', $cookie, 2);
    $cookie_key_domain = '';
    if(preg_match('#domain\s*=\s*\.?([^;\s]+)#i', $cookie, $cookie_domain_match)){
        $cookie_key_domain = strtolower($cookie_domain_match[1]);
    }
    $_SESSION['cookies'][trim($cookie_name_parts[0]).'@'.$cookie_key_domain] = $cookie;
}
// modify all hrefs so that they go through the proxy
$content = link_href_replace($content);

// asset handling: images, scripts and stylesheets keep loading straight
// from the remote site, so their urls only need to be made absolute.
// Each pattern stays inside one tag ([^>]), may span line breaks, and will
// not mistake data-src / data-href for the real attribute. The url is the
// second capture group, as single_asset_replace() expects.
$asset_patterns = array(
    '#<img\s[^>]*?(?<![\w\-])src\s*=\s*("|\')([^"\']*)("|\')#i',
    '#<script\s[^>]*?(?<![\w\-])src\s*=\s*("|\')([^"\']*)("|\')#i',
    '#<link\s[^>]*?(?<![\w\-])href\s*=\s*("|\')([^"\']*)("|\')#i'
);
foreach($asset_patterns as $asset_pattern){
    $content = single_asset_replace($asset_pattern, $content);
}
// make sure the meta refreshes go through our script
preg_match_all('#<meta\s[^>]*>#i', $content, $meta_tags);
$refresh_map = array();
foreach(array_unique($meta_tags[0]) as $meta_tag){
    // only <meta http-equiv="refresh" ...>, attributes in any order
    if(!preg_match('#http-equiv\s*=\s*["\']?refresh\b#i', $meta_tag)){
        continue;
    }
    // content="<seconds>; url=<target>" - any number of digits, optional
    // spaces, optional quotes around the target
    if(!preg_match('#content\s*=\s*["\']\s*\d+(?:\.\d+)?\s*[;,]\s*url\s*=\s*["\']?([^"\'>\s]+)#i', $meta_tag, $refresh_target, PREG_OFFSET_CAPTURE)){
        continue;
    }
    $uri = $refresh_target[1][0];

    // resolve the target against the page being proxied
    if(preg_match('#^[a-z][a-z0-9+.\-]*:#i', $uri)){
        // absolute: only http(s) targets can go through the proxy
        if(!preg_match('#^https?://#i', $uri)){
            continue;
        }
    }elseif(strpos($uri, '//') === 0){
        $uri = $urlp['scheme'].':'.$uri;
    }elseif($uri[0] === '/'){
        $uri = $base.$uri;
    }elseif($uri[0] === '?'){
        $uri = $cur.$uri;
    }elseif($uri[0] === '#'){
        $uri = $cur;
    }else{
        $uri = rtrim($cur_dir, '/').'/'.preg_replace('#^(?:\./)+#', '', $uri);
    }

    // drop the leading "http", the same way rewritten links do; the request
    // handling code puts it back
    if(preg_match('#^http(s?)://#i', $uri, $refresh_scheme)){
        $uri = strtolower($refresh_scheme[1]).'://'.substr($uri, strlen($refresh_scheme[0]));
    }

    // build the query string and swap it in at the exact position of the
    // original target
    $uri = '?url='.urlencode($uri);
    $refresh_map[$meta_tag] = substr_replace($meta_tag, $uri, $refresh_target[1][1], strlen($refresh_target[1][0]));
}
if($refresh_map){
    // strtr() replaces in a single pass, so rewritten tags are not rescanned
    $content = strtr($content, $refresh_map);
}
// send all forms through post then send the values to the remote site
// The browser always posts to this script; the form's real target and its
// original method travel in the action as ?url=...&method=...
preg_match_all('#<form\b[^>]*>#i', $content, $form_tags);
$form_map = array();
foreach(array_unique($form_tags[0]) as $form_tag){
    // read action and method; the value may be double quoted, single quoted
    // or bare, and either attribute may be missing
    $form_attrs = array('action' => '', 'method' => 'get');
    foreach(array_keys($form_attrs) as $form_attr_name){
        if(preg_match('#\s'.$form_attr_name.'\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s>]+))#i', $form_tag, $form_attr_match)){
            $form_attrs[$form_attr_name] = trim(isset($form_attr_match[1]) ? $form_attr_match[1] : '');
        }
    }
    $action = $form_attrs['action'];
    $form_method = strtolower($form_attrs['method']) === 'post' ? 'post' : 'get';

    // modify the forms action to use our script
    if($action === '' || $action[0] === '#'){
        // no action: the form submits to the current page
        $action = $cur;
    }elseif(preg_match('#^[a-z][a-z0-9+.\-]*:#i', $action)){
        // absolute: only http(s) targets can go through the proxy
        if(!preg_match('#^https?://#i', $action)){
            continue;
        }
    }elseif(strpos($action, '//') === 0){
        $action = $urlp['scheme'].':'.$action;
    }elseif($action[0] === '/'){
        $action = $base.$action;
    }elseif($action[0] === '?'){
        $action = $cur.$action;
    }else{
        $action = rtrim($cur_dir, '/').'/'.preg_replace('#^(?:\./)+#', '', $action);
    }

    // drop the leading "http", the same way rewritten links do; the request
    // handling code puts it back
    if(preg_match('#^http(s?)://#i', $action, $action_scheme)){
        $action = strtolower($action_scheme[1]).'://'.substr($action, strlen($action_scheme[0]));
    }

    // build action query string
    $action = '?url='.urlencode($action).'&amp;method='.$form_method;

    // Rebuild the tag: drop the old action/method attributes and put the new
    // ones right after "<form". Replacing the attributes themselves (not
    // their text wherever it occurs) keeps the rest of the tag, and the
    // method recorded inside the new action, intact.
    $form_rest = preg_replace('#\s+(?:action|method)\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>]+)#i', '', substr($form_tag, 5));
    $form_map[$form_tag] = substr($form_tag, 0, 5).' action="'.$action.'" method="post"'.$form_rest;
}
if($form_map){
    // strtr() replaces in a single pass, so rewritten tags are not rescanned
    $content = strtr($content, $form_map);
}

echo $content;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment