Created
November 16, 2017 08:01
-
-
Save imdong/33598d94ec299da96b0e92839c94ef24 to your computer and use it in GitHub Desktop.
SiteTplDown / 网站扒模板工具
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Fetch a URL with cURL and split the response into headers, cookies and body.
 *
 * NOTE(review): TLS peer and host verification are switched off below, so
 * HTTPS responses are not authenticated. Confirm this is acceptable before
 * using the function for anything other than scraping public pages.
 *
 * @param string  $_Get_Url   URL to request
 * @param string  $_Method    "POST" (case-insensitive) sends $_Form_Data as a POST body;
 *                            any other value leaves cURL's default method in place
 * @param string  $_Form_Data POST payload, handed to CURLOPT_POSTFIELDS
 * @param string  $_Cookie    Cookie string to send; skipped when empty
 * @param array   $_Headers   Extra request headers, either a list of "Name: value"
 *                            lines or a map of name => value. A 'User-Agent' key
 *                            also sets CURLOPT_USERAGENT.
 * @param array   $_Proxy     "Proxy" => proxy address, plus optional "UserName" and
 *                            "PassWord" (the legacy key "UserNmae" is still accepted)
 * @param integer $_Time_Out  Connection timeout in seconds (only the connect phase
 *                            is limited, not the whole transfer)
 * @return array  On failure: array("Error" => cURL error message).
 *                On success: "Info" (curl_getinfo() result), "Header" (final response
 *                header block without the trailing blank line), "Cookies"
 *                ("name=value; name2=value2;" built from Set-Cookie lines, or "")
 *                and "Body".
 */
function Get_Web_Contents($_Get_Url, $_Method = "GET", $_Form_Data = "", $_Cookie = "", $_Headers = array(), $_Proxy = array("Proxy" => ""), $_Time_Out = 30)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $_Get_Url);
    curl_setopt($ch, CURLOPT_HEADER, 1);          // include response headers in the returned string
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);  // return the response instead of printing it
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $_Time_Out);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

    // Proxy settings
    if (isset($_Proxy["Proxy"])) {
        curl_setopt($ch, CURLOPT_PROXY, $_Proxy["Proxy"]);
        // "UserNmae" is the historical (misspelled) key; keep honouring it.
        $proxyUser = null;
        if (isset($_Proxy["UserName"])) {
            $proxyUser = $_Proxy["UserName"];
        } elseif (isset($_Proxy["UserNmae"])) {
            $proxyUser = $_Proxy["UserNmae"];
        }
        if ($proxyUser !== null && isset($_Proxy["PassWord"])) {
            curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxyUser . ":" . $_Proxy["PassWord"]);
        }
    }

    // POST payload
    if (strtoupper($_Method) == "POST") {
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $_Form_Data);
    }

    // Request cookies
    if ((string) $_Cookie !== "") {
        curl_setopt($ch, CURLOPT_COOKIE, $_Cookie);
    }

    // Extra request headers
    if (is_array($_Headers) && $_Headers) {
        if (isset($_Headers['User-Agent'])) {
            curl_setopt($ch, CURLOPT_USERAGENT, $_Headers['User-Agent']);
        }
        // CURLOPT_HTTPHEADER only looks at array values, so a name => value
        // entry has to be turned into a complete "Name: value" line.
        $headerLines = array();
        foreach ($_Headers as $name => $value) {
            if (is_string($name) && stripos((string) $value, $name . ":") !== 0) {
                $headerLines[] = $name . ": " . $value;
            } else {
                $headerLines[] = (string) $value;
            }
        }
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headerLines);
    }

    $html = curl_exec($ch);
    if ($html === false) {
        $ret = array("Error" => curl_error($ch));
        // curl_close() does nothing from PHP 8.0 on, so only call it where it frees the handle.
        if (PHP_VERSION_ID < 80000) {
            curl_close($ch);
        }
        return $ret;
    }

    $ret = array("Info" => curl_getinfo($ch));
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch);
    }

    // Split by the header size cURL reports: it covers every header block
    // (proxy CONNECT reply, "100 Continue", final response), which searching
    // for the first blank line does not.
    $headerSize = isset($ret["Info"]["header_size"]) ? (int) $ret["Info"]["header_size"] : 0;
    $rawHeaders = trim((string) substr($html, 0, $headerSize));
    $headerBlocks = $rawHeaders === "" ? array("") : preg_split("/\r\n\r\n/", $rawHeaders);
    $ret["Header"] = $headerBlocks[count($headerBlocks) - 1]; // the final response's headers

    // Collect the name=value part of every Set-Cookie line, with or without attributes.
    $cookiePairs = array();
    if (preg_match_all('/^set-cookie:\s*([^;\r\n]*=[^;\r\n]*)/im', $ret["Header"], $matches)) {
        $cookiePairs = $matches[1];
    }
    // The trailing ";" is kept so the string has the same shape callers have always received.
    $ret["Cookies"] = $cookiePairs ? implode("; ", $cookiePairs) . ";" : "";

    $ret["Body"] = (string) substr($html, $headerSize);
    return $ret;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* 功能方法 | |
*/ | |
require 'function.lib.php'; // 引用公共函数类库 | |
/**
 * Print a UTF-8 string, converting it to GB2312 when running under the CLI SAPI.
 *
 * Under any other SAPI the string is printed unchanged. If the conversion
 * fails (for example the text contains characters GB2312 cannot represent),
 * the original UTF-8 string is printed instead of nothing.
 *
 * @param string $Str UTF-8 text to print
 * @return void
 */
function echoE($Str)
{
    if (php_sapi_name() != 'cli') {
        echo $Str;
        return;
    }

    // iconv() raises a notice and returns false on input it cannot convert;
    // swallow the notice here and handle the false return explicitly.
    set_error_handler(function () {
        return true;
    });
    $converted = iconv('utf-8', 'gb2312', $Str);
    restore_error_handler();

    echo $converted === false ? $Str : $converted;
}
/**
 * Parse a URL-list file into a map of site name => (page name => URL).
 *
 * File format: groups separated by one or more blank lines. The first
 * non-empty line of a group is the site name; every following line is
 * "pageName,url". Only the first comma separates name and URL, so URLs may
 * contain commas. Lines without a comma or with an empty URL are ignored.
 *
 * @param string $urlFileName Path of the URL-list file
 * @return array Parsed list; an empty array when the file cannot be read
 */
function expUrlList($urlFileName)
{
    $urlListStr = file_get_contents($urlFileName);
    if ($urlListStr === false) {
        // file_get_contents() has already raised a warning; nothing to parse.
        return array();
    }

    // Drop a UTF-8 BOM so it does not become part of the first site name.
    if (strncmp($urlListStr, "\xEF\xBB\xBF", 3) === 0) {
        $urlListStr = substr($urlListStr, 3);
    }

    // Normalise all line endings to \n, then split into blank-line separated groups.
    $urlListStr = str_replace(array("\r\n", "\r"), "\n", $urlListStr);
    $groups = preg_split('/\n\s*\n/', trim($urlListStr), -1, PREG_SPLIT_NO_EMPTY);

    $urlAll = array();
    foreach ($groups as $group) {
        $urlName = null;
        foreach (explode("\n", $group) as $line) {
            $line = trim($line);
            if ($line === '') {
                continue;
            }
            // First non-empty line of the group is the site name.
            if ($urlName === null) {
                $urlName = $line;
                $urlAll[$urlName] = array();
                continue;
            }
            // Limit 2: everything after the first comma belongs to the URL.
            $urlPath = explode(',', $line, 2);
            if (count($urlPath) < 2) {
                continue;
            }
            $pageName = trim($urlPath[0]);
            $pageUrl = trim($urlPath[1]);
            if ($pageUrl === '') {
                continue;
            }
            $urlAll[$urlName][$pageName] = $pageUrl;
        }
    }
    return $urlAll;
}
/**
 * Download a page and collect the stylesheet, script and image URLs it references.
 *
 * Every <link>, <script> and <img> tag carrying a quoted src/href attribute is
 * recorded under 'css', 'js' or 'img' respectively, together with the absolute
 * URL it resolves to relative to the page.
 *
 * @param string $url Page URL
 * @return array 'Body' => page HTML as downloaded,
 *               'urls' => array('css' => ..., 'js' => ..., 'img' => ...), each a list of
 *                         array('str' => reference as written in the page,
 *                               'inte' => resolved absolute URL).
 *               When the download fails, 'Body' is '', the lists are empty and an
 *               additional 'Error' key holds the error message.
 */
function getHtmlSrc($url)
{
    $urlList = array('css' => array(), 'js' => array(), 'img' => array());

    // Download the page; on failure Get_Web_Contents() returns no 'Body' key.
    $pageInfo = Get_Web_Contents($url);
    if (!is_array($pageInfo) || !isset($pageInfo['Body'])) {
        return array(
            'Body'  => '',
            'urls'  => $urlList,
            'Error' => isset($pageInfo['Error']) ? $pageInfo['Error'] : 'download failed',
        );
    }
    $pageBody = (string) $pageInfo['Body'];

    // Work out the page's origin (scheme://host[:port]) and its directory.
    $pageUrlInfo = parse_url($url);
    if (!is_array($pageUrlInfo)) {
        $pageUrlInfo = array();
    }
    $scheme = empty($pageUrlInfo['scheme']) ? 'http' : $pageUrlInfo['scheme'];
    $siteDomain = '';
    if (!empty($pageUrlInfo['host'])) {
        $siteDomain = "{$scheme}://{$pageUrlInfo['host']}";
        if (isset($pageUrlInfo['port'])) {
            $siteDomain .= ":{$pageUrlInfo['port']}";
        }
    }
    $pagePath = empty($pageUrlInfo['path']) ? '/' : $pageUrlInfo['path'];
    $sitePresenPath = preg_replace('#/[^/]+$#', '/', $pagePath); // directory of the page

    // [^>]* keeps the match inside one tag; \s before src/href excludes
    // attributes such as data-src; the i flag accepts upper-case markup.
    preg_match_all(
        '#<(?<tag>link|script|img)\b[^>]*?\s(?:src|href)\s*=\s*[\'"](?<url>[^"\']+)[\'"][^>]*>#i',
        $pageBody,
        $pageSrcList
    );

    foreach ($pageSrcList['tag'] as $key => $tagType) {
        $tagType = strtolower($tagType);
        $tagName = $tagType == 'link' ? 'css' : ($tagType == 'script' ? 'js' : 'img');
        $srcStr = $pageSrcList['url'][$key];
        $urlList[$tagName][] = array(
            'str'  => $srcStr,
            'inte' => resolveSrcUrl(trim($srcStr), $scheme, $siteDomain, $sitePresenPath),
        );
    }

    return array(
        'Body' => $pageBody,
        'urls' => $urlList
    );
}

/**
 * Resolve a reference found in a page against that page's location.
 *
 * @param string $src        Reference as written in the page
 * @param string $scheme     Scheme of the page URL, used for "//host/..." references
 * @param string $siteDomain Origin of the page ("scheme://host[:port]")
 * @param string $basePath   Directory of the page, ending in "/"
 * @return string Absolute URL; references that already carry a scheme are returned unchanged
 */
function resolveSrcUrl($src, $scheme, $siteDomain, $basePath)
{
    // Already absolute (http:, https:, data:, ...): nothing to resolve.
    if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $src)) {
        return $src;
    }
    // Protocol-relative reference: reuse the page's scheme.
    if (strncmp($src, '//', 2) === 0) {
        return "{$scheme}:{$src}";
    }

    // Set the query string / fragment aside so only the path is normalised.
    $suffix = '';
    $cut = strcspn($src, '?#');
    if ($cut < strlen($src)) {
        $suffix = substr($src, $cut);
        $src = (string) substr($src, 0, $cut);
    }

    $path = strncmp($src, '/', 1) === 0 ? $src : $basePath . $src;

    // Collapse "." and ".." segments; ".." above the root is ignored.
    $parts = explode('/', ltrim($path, '/'));
    $last = count($parts) - 1;
    $segments = array();
    foreach ($parts as $i => $part) {
        if ($part === '.' || $part === '..') {
            if ($part === '..') {
                array_pop($segments);
            }
            if ($i === $last) {
                $segments[] = ''; // a trailing dot segment still names a directory
            }
        } else {
            $segments[] = $part;
        }
    }

    return $siteDomain . '/' . implode('/', $segments) . $suffix;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* 模板下载功能 | |
* | |
* 指定要下载的模板列表 | |
*/ | |
require 'function.php'; // 引用方法 | |
// URL-list file: first command-line argument, or ./url.txt when none is given.
$urlFileName = empty($_SERVER['argv']['1']) ? './url.txt' : $_SERVER['argv']['1'];
$siteList = expUrlList($urlFileName); // site name => (page name => URL)
/**
 * Planned pipeline:
 *  - download every page
 *  - extract all css / js / img references, resolve them and remove duplicates
 *  - download the js and css files
 *  - extract the image URLs referenced by the css files
 *  - download all images
 *  - rewrite the URLs inside the pages
 *  - rewrite the URLs inside the css files
 *
 * Only the first two steps are carried out below.
 */
$newSiteInfo = array(); // per page: downloaded body and the resources it references
$newSiteUrls = array('css' => array(), 'js' => array(), 'img' => array()); // de-duplicated resource URLs per type
foreach ($siteList as $siteName => $urlList) {
    foreach ($urlList as $urlName => $url) {
        // Download the page and extract its resource references.
        $pageInfo = getHtmlSrc($url);
        $newSiteInfo[$siteName][$urlName] = $pageInfo;

        // Merge this page's resources into the overall list; keying by the
        // resolved URL removes duplicates across pages.
        if (empty($pageInfo['urls']) || !is_array($pageInfo['urls'])) {
            continue;
        }
        foreach ($pageInfo['urls'] as $type => $resources) {
            foreach ($resources as $resource) {
                $newSiteUrls[$type][$resource['inte']] = $resource['inte'];
            }
        }
    }
}
var_dump($newSiteInfo);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
按照下面格式添加地址即可(每个网站一组:第一行是网站名称,其后每行为“页面名,网址”;组与组之间用一个空行分隔):

网站一
首页,http://www.baidu.cn/
频道,http://www.baidu.cn/news/
内容,http://www.baidu.cn/news/1403.html

网站二
首页,http://www.qq.com/
频道,http://www.qq.com/guonei/wz_by/
频道2,http://www.qq.com/guonei/wz/
内容,http://www.qq.com/guonei/wz_jc/1536.html