PHP DOMDocument::loadHTML functions
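For context, here is a minimal, standalone sketch of the DOMDocument::loadHTML() pattern the crawler below is built on (illustrative only; the markup string is made up):

<?php
$doc = new DOMDocument();
// loadHTML() warns on imperfect real-world markup; collect the warnings instead.
libxml_use_internal_errors(true);
$doc->loadHTML('<html><head><title>Example</title></head><body></body></html>');
libxml_clear_errors();
// Prints "Example".
echo $doc->getElementsByTagName("title")->item(0)->nodeValue;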
<?php
/*
 * howCode Web Crawler Tutorial Series Source Code
 * Copyright (C) 2016
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * https://howcode.org
 *
 */
// This is our starting point. Change this to whatever URL you want.
$start = "";

// Our 2 global arrays containing our links to be crawled.
$already_crawled = array();
$crawling = array();
function get_details($url) {
    // The array that we pass to stream_context_create() to modify our User-Agent.
    // Note: the http context option is named 'header' (singular); the original
    // 'headers' key is silently ignored, so the User-Agent was never actually sent.
    $options = array('http' => array('method' => "GET", 'header' => "User-Agent: howBot/0.1\r\n"));
    // Create the stream context.
    $context = stream_context_create($options);
    // Create a new instance of PHP's DOMDocument class.
    $doc = new DOMDocument();
    // Use file_get_contents() to download the page and pass its output to
    // DOMDocument::loadHTML(). The @ operators suppress warnings from
    // unreachable URLs and from malformed real-world HTML.
    @$doc->loadHTML(@file_get_contents($url, false, $context));
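    // A cleaner (optional) alternative to the @ suppression above is to
    // collect libxml parse errors instead, e.g.:
    //   libxml_use_internal_errors(true);
    //   $doc->loadHTML(file_get_contents($url, false, $context));
    //   libxml_clear_errors();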
    // Grab the node list of <title> tags.
    $title = $doc->getElementsByTagName("title");
    // There should only be one <title> per page; guard against pages that have none.
    $title = ($title->length > 0) ? $title->item(0)->nodeValue : "";
    // Give $description and $keywords no value initially. We do this to prevent errors.
    $description = "";
    $keywords = "";
    // Create a node list of all of the page's <meta> tags. There will probably be lots of these.
    $metas = $doc->getElementsByTagName("meta");
    // Loop through all of the <meta> tags we find.
    for ($i = 0; $i < $metas->length; $i++) {
        $meta = $metas->item($i);
        // Get the description and the keywords.
        if (strtolower($meta->getAttribute("name")) == "description")
            $description = $meta->getAttribute("content");
        if (strtolower($meta->getAttribute("name")) == "keywords")
            $keywords = $meta->getAttribute("content");
    }
    // Return our JSON string containing the title, description, keywords and URL.
    // (Newlines are stripped; note that embedded double quotes would still break the JSON.)
    return '{ "Title": "'.str_replace("\n", "", $title).'", "Description": "'.str_replace("\n", "", $description).'", "Keywords": "'.str_replace("\n", "", $keywords).'", "URL": "'.$url.'"},';
}

function follow_links($url) {
    // Give our function access to our crawl arrays.
    global $already_crawled;
    global $crawling;
    // The array that we pass to stream_context_create() to modify our User-Agent
    // (again using the 'header' key; see the note in get_details()).
    $options = array('http' => array('method' => "GET", 'header' => "User-Agent: howBot/0.1\r\n"));
    // Create the stream context.
    $context = stream_context_create($options);
    // Create a new instance of PHP's DOMDocument class.
    $doc = new DOMDocument();
    // Use file_get_contents() to download the page and pass its output to
    // DOMDocument::loadHTML().
    @$doc->loadHTML(@file_get_contents($url, false, $context));
    // Create a node list of all of the links we find on the page.
    $linklist = $doc->getElementsByTagName("a");

    // Parse the current URL once instead of re-parsing it on every branch below.
    $parsed = parse_url($url);

    // Loop through all of the links we find.
    foreach ($linklist as $link) {
        $l = $link->getAttribute("href");

        // Normalise each link to an absolute URL. This is covered in part 2
        // and part 3 of the video series.
        if (substr($l, 0, 1) == "/" && substr($l, 0, 2) != "//") {
            // Root-relative link: prepend scheme and host.
            $l = $parsed["scheme"]."://".$parsed["host"].$l;
        } else if (substr($l, 0, 2) == "//") {
            // Protocol-relative link: prepend the scheme only.
            $l = $parsed["scheme"].":".$l;
        } else if (substr($l, 0, 2) == "./") {
            // Same-directory link: resolve against the current page's directory.
            $l = $parsed["scheme"]."://".$parsed["host"].dirname($parsed["path"]).substr($l, 1);
        } else if (substr($l, 0, 1) == "#") {
            // Fragment link: append to the current page's URL.
            $l = $parsed["scheme"]."://".$parsed["host"].$parsed["path"].$l;
        } else if (substr($l, 0, 3) == "../") {
            // Parent-directory link: resolved naively against the host root here.
            $l = $parsed["scheme"]."://".$parsed["host"]."/".$l;
        } else if (substr($l, 0, 11) == "javascript:") {
            // Skip javascript: pseudo-links entirely.
            continue;
        } else if (substr($l, 0, 5) != "https" && substr($l, 0, 4) != "http") {
            // Anything else without a scheme is treated as host-root-relative.
            $l = $parsed["scheme"]."://".$parsed["host"]."/".$l;
        }
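        // Note (a limitation of the original logic): schemes such as mailto:
        // or tel: are not filtered out, so they fall into the final branch
        // above and get rewritten into broken host-relative URLs.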
        // If the link isn't already in our crawl array, add it; otherwise ignore it.
        if (!in_array($l, $already_crawled)) {
            $already_crawled[] = $l;
            $crawling[] = $l;
            // Output the page title, description, keywords and URL. This output is
            // piped off to an external file using the command line.
            echo get_details($l)."\n";
        }
    }
    // Remove an item from the array after we have crawled it.
    // This prevents infinitely crawling the same page.
    array_shift($crawling);
    // Follow each link in the crawling array.
    foreach ($crawling as $site) {
        follow_links($site);
    }
}

// Begin the crawling process by crawling the starting link first.
follow_links($start);
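Usage note: set $start to a seed URL, then run the script from the command line and redirect its output to a file, e.g. php crawler.php > results.json (the filename crawler.php is illustrative). Each crawled page is emitted as one JSON object followed by a trailing comma, so the output needs to be wrapped in [ ... ] and the final comma trimmed before it parses as a JSON array.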