Last active
August 29, 2015 14:16
-
-
Save xfenix/7e8f96d1d0ad9167083a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Buffer libxml parse errors internally instead of emitting a PHP warning for
// each one: the remote HTML parsed below is not guaranteed to be well-formed.
libxml_use_internal_errors(true);
/**
 * Scrapes product links from a shop page and renders them as an XML document.
 *
 * The page is searched for elements carrying the RESTRICTED_CLASS class whose
 * text contains one of the configured section titles. For every such heading
 * the product anchors below its third ancestor (see PRODUCTLIST_XPATH) are
 * collected, and their price/link/title attributes are written out as
 * <item> elements under a single root element.
 */
class Spider {
    /** Base URL of the shop; relative links are resolved against it. */
    private $basicUrl = 'http://euroset.ru/';

    /** Map of section id => heading text of the sections to scrape. */
    private $restrictedSectionTitles = [];

    const SECTION_MOBILE = 1;
    const SECTION_PAD = 2;
    // Heading texts as they appear on the page ("Mobile phones" / "Tablets, Laptops").
    const SECTION_MOBILE_TITLE = 'Мобильные телефоны';
    const SECTION_PAD_TITLE = 'Планшеты, Ноутбуки';
    // CSS class carried by the section heading elements.
    const RESTRICTED_CLASS = 'section-title';
    // Path from a section wrapper (third ancestor of the heading) to the first
    // anchor of each product list entry.
    const PRODUCTLIST_XPATH = './div[2]/div/div[1]/ul[1]/li/div/ul[1]/li/a[1]';
    // Element names used in the generated XML.
    const XML_PRICE_TAG = 'price';
    const XML_LINK_TAG = 'link';
    const XML_TITLE_TAG = 'model';
    const XML_ROOT_TAG = 'rrpmon';
    // Format of the "date" attribute on the root element.
    const DATE_FORMAT = 'Y-m-d H:i:s';

    /**
     * Registers the section headings whose products should be scraped.
     */
    public function __construct() {
        $this->restrictedSectionTitles = [
            self::SECTION_MOBILE => self::SECTION_MOBILE_TITLE,
            self::SECTION_PAD => self::SECTION_PAD_TITLE,
        ];
    }

    /**
     * Builds an XPath expression selecting every element that has the given
     * CSS class among its space-separated class tokens.
     *
     * @param string $className class token to look for
     * @return string XPath expression
     */
    private function getClassXpath($className) {
        // Both the attribute and the needle are padded with spaces so that only
        // whole tokens match ("section-title" must not match "section-title-x").
        return '//*[contains(concat(" ", normalize-space(@class), " "), " ' . $className . ' ")]';
    }

    /**
     * Turns a site-relative path into an absolute URL on the shop's domain.
     *
     * @param string $url site-relative path or absolute URL
     * @return string absolute URL
     */
    private function domainize($url) {
        // An href that already carries a scheme must not get the base prepended.
        if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $url)) {
            return $url;
        }
        return $this->basicUrl . ltrim($url, '/');
    }

    /**
     * Renders the scraped rows as an XML document.
     *
     * @param array $data list of rows, each a map of XML tag name => raw value
     * @return string serialized XML
     */
    private function createXMLFromDict($data) {
        // create dom and root
        $dom = new DOMDocument('1.0', 'utf-8');
        $root = $dom->createElement(self::XML_ROOT_TAG);
        $root->setAttribute('shop', $this->basicUrl);
        $root->setAttribute('date', date(self::DATE_FORMAT));

        // create items
        foreach ($data as $element) {
            $item = $dom->createElement('item');
            foreach ($element as $key => $value) {
                // beautify data
                $value = trim($value);
                switch ($key) {
                    case self::XML_LINK_TAG:
                        $value = $this->domainize($value);
                        break;
                    case self::XML_PRICE_TAG:
                        $value = floatval($value);
                        break;
                }
                // The value goes in through a text node: the second argument of
                // createElement() is not escaped, so a "&" in a link or title
                // would raise a warning and truncate the content.
                $subItem = $dom->createElement($key);
                $subItem->appendChild($dom->createTextNode((string) $value));
                $item->appendChild($subItem);
            }
            $root->appendChild($item);
        }

        // append root to xml and return
        $dom->appendChild($root);
        return $dom->saveXML();
    }

    /**
     * Downloads a page of the shop and returns its products as XML.
     *
     * @param string $pageUrl path of the page, resolved against the shop's base URL
     * @return string the XML document, or a message starting with
     *                "Sorry, raised exception: " when the page cannot be loaded
     */
    public function createXMLFromUrl($pageUrl) {
        // create dom and xpath objects
        $dom = new DOMDocument();
        try {
            $loaded = $dom->loadHTMLFile($this->domainize($pageUrl));
        } catch (Exception $e) {
            return 'Sorry, raised exception: ' . $e->getMessage();
        }
        // loadHTMLFile() signals failure through its return value rather than
        // an exception, so the result has to be checked explicitly.
        $lastError = libxml_get_last_error();
        libxml_clear_errors(); // keep the internal error buffer from growing
        if ($loaded === false) {
            $reason = $lastError ? trim($lastError->message) : 'unable to load page';
            return 'Sorry, raised exception: ' . $reason;
        }
        $xpathDom = new DOMXPath($dom);

        // find restricted sections
        $grabbedData = [];
        $headings = $xpathDom->query($this->getClassXpath(self::RESTRICTED_CLASS));
        foreach ($headings as $element) {
            // go through restricted section titles and check
            foreach ($this->restrictedSectionTitles as $title) {
                if (strpos($element->nodeValue, $title) === false) {
                    continue;
                }
                // find wrapper of title and product sections: three levels up
                $productParent = $element;
                for ($level = 0; $level < 3 && $productParent !== null; $level++) {
                    $productParent = $productParent->parentNode;
                }
                if ($productParent === null) {
                    // heading sits too close to the document root
                    continue;
                }
                // find product list items (first a tag actually)
                $productItems = $xpathDom->query(self::PRODUCTLIST_XPATH, $productParent);
                if ($productItems === false) {
                    continue;
                }
                foreach ($productItems as $item) {
                    $grabbedData[] = [
                        self::XML_PRICE_TAG => $item->getAttribute('data-product-price'),
                        self::XML_LINK_TAG => $item->getAttribute('href'),
                        self::XML_TITLE_TAG => $item->getAttribute('title'),
                    ];
                }
            }
        }

        // release the parsed page before the output document is built
        unset($headings, $xpathDom, $dom);

        // prepare xml and return it
        return $this->createXMLFromDict($grabbedData);
    }
}
// Scrape the shop's front page and print the resulting XML (or error message).
$spiderInstance = new Spider();
print_r($spiderInstance->createXMLFromUrl('/'));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment