Created
April 24, 2014 15:14
-
-
Save thoslin/11258378 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* A simple PHP web scraper | |
* >> php -f scraper.php | |
*/ | |
//echo "Hello World!"; | |
/**
 * Issue an HTTP GET request through cURL and hand back the response body.
 *
 * Redirects are followed and a desktop Safari user-agent string is sent.
 *
 * @param string $url Address to fetch.
 * @return mixed Whatever curl_exec() returns: the body as a string, or
 *               false when the transfer fails.
 */
function request_url($url) {
    $options = array(
        CURLOPT_URL => $url,
        CURLOPT_FOLLOWLOCATION => 1,
        // Return the body instead of printing it straight to stdout.
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
    );

    $handle = curl_init();
    curl_setopt_array($handle, $options);
    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
/**
 * Construct an absolute URL from a site-relative path.
 *
 * Input that does not mention the site's host name is treated as a path on
 * that host. The result always carries a scheme, so it can be stored or
 * followed as-is; input that already has a scheme is returned unchanged.
 *
 * @param string $url Relative path (with or without a leading slash) or a
 *                    URL that already contains the host name.
 * @return string Absolute URL.
 */
function build_url($url) {
    $domain = "www.unite-students.com";
    $scheme = "http://";

    if (strpos($url, $domain) === false) {
        // Normalise the separator so "foo" and "/foo" both become "host/foo".
        $url = $domain . '/' . ltrim($url, '/');
    }

    // Only prepend a scheme when the string does not already start with one.
    // ltrim() also turns protocol-relative "//host/path" into "host/path".
    if (!preg_match('#^[a-z][a-z0-9+.-]*://#i', $url)) {
        $url = $scheme . ltrim($url, '/');
    }

    return $url;
}
/**
 * Download a page and collect the distinct links selected by an XPath query.
 *
 * @param string $url   Page to download.
 * @param string $xpath XPath expression selecting the nodes whose value is a
 *                      link (for example an @href attribute).
 * @return array List of unique URLs, each passed through build_url(). Empty
 *               when the page could not be fetched or nothing matched.
 */
function extract_urls($url, $xpath) {
    $links = array();

    $response = request_url($url);
    // A failed transfer yields false and an empty body has nothing to parse;
    // DOMDocument::loadHTML() rejects empty input, so stop here.
    if (!is_string($response) || $response === '') {
        return $links;
    }

    $dom_document = new DOMDocument();
    $dom_document->loadHTML($response);
    $dom_xpath = new DOMXpath($dom_document);

    $elements = $dom_xpath->query($xpath);
    // query() returns false for an expression it cannot evaluate.
    if ($elements === false) {
        return $links;
    }

    foreach ($elements as $element) {
        $links[] = build_url($element->nodeValue);
    }

    // array_unique() returns a new array and keeps the original keys, so the
    // result must be captured and re-indexed to get a clean list.
    return array_values(array_unique($links));
}
/**
 * Run an XPath query and gather the `data` property of every matched node.
 *
 * @param DOMXPath $dom_xpath XPath evaluator bound to the document to search.
 * @param string   $query     XPath expression; callers in this file select
 *                            text() nodes.
 * @return mixed The single value when exactly one node matched, otherwise an
 *               array of all values (empty when nothing matched).
 */
function extract_data($dom_xpath, $query) {
    $values = array();

    foreach ($dom_xpath->query($query) as $text_node) {
        $values[] = $text_node->data;
    }

    // Unwrap a lone result so callers get a scalar for single-valued fields.
    return (count($values) === 1) ? $values[0] : $values;
}
/**
 * Parse information from a detail page.
 *
 * Downloads the page and reads the listing name, headline price, street
 * address, and one entry per room header found in the markup.
 *
 * @param string $url Address of the detail page.
 * @return array Map with keys 'url', 'name', 'price', 'address' and 'rooms';
 *               'rooms' is a list of arrays with keys 'name' and 'price'
 *               ('price' is null when the header text has no following node).
 * @throws Exception When the page could not be fetched or came back empty.
 */
function parse_item($url) {
    $response = request_url($url);
    // Fail with a catchable Exception instead of handing false or "" to
    // DOMDocument::loadHTML(), which rejects empty input.
    if (!is_string($response) || $response === '') {
        throw new Exception("Empty or failed response for {$url}");
    }

    $dom_document = new DOMDocument();
    $dom_document->loadHTML($response);
    $dom_xpath = new DOMXpath($dom_document);

    $listing = array(
        'url' => $url,
        'name' => extract_data($dom_xpath, "//span[@itemprop='name']/text()"),
        'price' => extract_data($dom_xpath, "//div[@data-tab='overview']/h2/span/text()"),
        'address' => extract_data($dom_xpath, "//span[@itemprop='streetAddress']/text()"),
    );

    $room_nodes = $dom_xpath->query("//h2[@class='property-room-header']/text()");
    $rooms = array();
    foreach ($room_nodes as $room_node) {
        // The price is read from whatever node follows the header text; that
        // node is absent when the text is the last child of its <h2>.
        $price_node = $room_node->nextSibling;
        $rooms[] = array(
            'name' => $room_node->data,
            'price' => ($price_node !== null) ? $price_node->nodeValue : null,
        );
    }
    $listing['rooms'] = $rooms;

    return $listing;
}
/**
 * Crawl the London listings and save every parsed detail page as JSON.
 *
 * Steps: collect the list-page URLs from the start page, pull the detail-page
 * paths out of each list page with a regular expression, parse each distinct
 * detail page once (pausing between requests), then write all listings to a
 * timestamped .json file next to this script.
 *
 * @return void
 */
function main() {
    // Suppress warnings
    error_reporting(E_ERROR | E_PARSE);

    $start_url = "http://www.unite-students.com/London";
    $listings = array();
    // Detail paths already handled, so a link repeated on one page or shared
    // between list pages is fetched only once.
    $crawled = array();

    echo "Start from here: {$start_url}\n\n";

    // List page URLS are in a special form like "/pp\d+/City"
    $list_page_urls = extract_urls($start_url,
        "//a[contains(@href, 'pp') and contains(text(), 'View all properties')]/@href");

    // Extract all detail page links
    foreach ($list_page_urls as $list_page_url) {
        $response = request_url($list_page_url);
        // request_url() may return false; cast so the regex sees a string.
        preg_match_all('/.*(\/london\/[\w-]+)/', (string) $response, $matches);

        $detail_paths = isset($matches[1])
            ? array_values(array_unique($matches[1]))
            : array();

        echo "Matched URLs:\n";
        print_r($detail_paths);

        // Scrape useful contents from all detail pages
        foreach ($detail_paths as $match) {
            if (isset($crawled[$match])) {
                continue;
            }
            $crawled[$match] = true;

            echo "\n\nCrawling URL " . $match . " ...\n";
            try {
                $item = parse_item(build_url($match));
                $listings[] = $item;
                print_r($item);
            } catch (Exception $e) {
                echo 'Caught exception: ', $e->getMessage(), "\n";
            }

            // Take a break.
            sleep(10);
        }
    }

    // Save it to json file
    $filename = __DIR__ . "/" . date("Y-m-d H:i:s") . '.json';
    $json = json_encode($listings);
    // Both calls signal failure with false; do not claim success in that case.
    if ($json === false || file_put_contents($filename, $json) === false) {
        echo "Failed to save results to {$filename}\n";
        return;
    }

    echo "Done! Results saved to {$filename}\n";
}
main(); | |
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment