Skip to content

Instantly share code, notes, and snippets.

@xfenix
Last active August 29, 2015 14:16
Show Gist options
  • Save xfenix/7e8f96d1d0ad9167083a to your computer and use it in GitHub Desktop.
Save xfenix/7e8f96d1d0ad9167083a to your computer and use it in GitHub Desktop.
<?php
// Make libxml collect its parse errors internally instead of emitting PHP
// warnings. Presumably needed because the scraped HTML is not expected to be
// well-formed -- the collected errors are never inspected in this file.
libxml_use_internal_errors(true);
/**
 * Scrapes product listings from selected sections of the shop's page and
 * renders them as an XML document.
 *
 * Usage: (new Spider())->createXMLFromUrl('/') returns the XML as a string,
 * or a human-readable error string when the page cannot be loaded.
 */
class Spider
{
    /** Base URL of the shop; relative links are resolved against it. */
    private $basicUrl = 'http://euroset.ru/';

    /** Map of section id => heading text of the sections whose products are collected. */
    private $restrictedSectionTitles = [];

    // Section identifiers (keys of $restrictedSectionTitles).
    const SECTION_MOBILE = 1;
    const SECTION_PAD = 2;

    // Heading texts searched for inside section-title elements
    // ("Mobile phones" and "Tablets, Laptops" respectively).
    const SECTION_MOBILE_TITLE = 'Мобильные телефоны';
    const SECTION_PAD_TITLE = 'Планшеты, Ноутбуки';

    // CSS class carried by the section heading elements.
    const RESTRICTED_CLASS = 'section-title';

    // Path from a section wrapper to the first <a> of every product list item.
    const PRODUCTLIST_XPATH = './div[2]/div/div[1]/ul[1]/li/div/ul[1]/li/a[1]';

    // Element names used in the generated XML.
    const XML_PRICE_TAG = 'price';
    const XML_LINK_TAG = 'link';
    const XML_TITLE_TAG = 'model';
    const XML_ROOT_TAG = 'rrpmon';

    // Format of the "date" attribute on the root element.
    const DATE_FORMAT = 'Y-m-d H:i:s';

    /**
     * Registers the sections whose products should be grabbed.
     */
    public function __construct()
    {
        $this->restrictedSectionTitles = [
            self::SECTION_MOBILE => self::SECTION_MOBILE_TITLE,
            self::SECTION_PAD => self::SECTION_PAD_TITLE,
        ];
    }

    /**
     * Builds an XPath expression matching every element that carries the
     * given CSS class as a whole token of its class attribute.
     *
     * Both the attribute and the class name are padded with spaces, so
     * "section-title" does not match "my-section-title-x".
     *
     * @param string $className CSS class name; must not contain a double quote.
     * @return string XPath expression.
     */
    private function getClassXpath($className)
    {
        return '//*[contains(concat(" ", normalize-space(@class), " "), " ' . $className . ' ")]';
    }

    /**
     * Resolves a site-relative URL against the shop's base URL.
     *
     * Links that are already absolute ("http://...") or protocol-relative
     * ("//host/...") are returned unchanged; prepending the base to them
     * would produce an invalid address.
     *
     * @param string $url Relative or absolute URL.
     * @return string URL usable outside the context of the scraped page.
     */
    private function domainize($url)
    {
        if (preg_match('#^(?:[a-z][a-z0-9+.-]*:)?//[^/]#i', $url) === 1) {
            return $url;
        }

        return $this->basicUrl . ltrim($url, '/');
    }

    /**
     * Serializes grabbed product rows into the XML document.
     *
     * @param array $data List of rows; each row maps an XML tag name to its raw string value.
     * @return string XML document.
     */
    private function createXMLFromDict($data)
    {
        $dom = new DOMDocument('1.0', 'utf-8');
        $root = $dom->createElement(self::XML_ROOT_TAG);
        $root->setAttribute('shop', $this->basicUrl);
        $root->setAttribute('date', date(self::DATE_FORMAT));

        foreach ($data as $element) {
            $item = $dom->createElement('item');
            foreach ($element as $key => $value) {
                // Normalize the raw attribute value depending on its kind.
                $value = trim($value);
                switch ($key) {
                    case self::XML_LINK_TAG:
                        $value = $this->domainize($value);
                        break;
                    case self::XML_PRICE_TAG:
                        $value = floatval($value);
                        break;
                }

                // The value goes in as a text node: the second argument of
                // createElement() is not escaped, so a "&" in a link or a
                // title would be treated as a broken entity reference.
                $subItem = $dom->createElement($key);
                $subItem->appendChild($dom->createTextNode((string) $value));
                $item->appendChild($subItem);
            }
            $root->appendChild($item);
        }

        $dom->appendChild($root);

        return $dom->saveXML();
    }

    /**
     * Downloads a page of the shop, collects the products listed in the
     * registered sections and returns them as XML.
     *
     * @param string $pageUrl Page address, relative to the shop's base URL or absolute.
     * @return string XML document, or an error message starting with "Sorry, "
     *                when the page could not be loaded.
     */
    public function createXMLFromUrl($pageUrl)
    {
        $url = $this->domainize($pageUrl);
        $dom = new DOMDocument();
        try {
            $loaded = $dom->loadHTMLFile($url);
        } catch (Exception $e) {
            return 'Sorry, raised exception: ' . $e->getMessage();
        }
        // loadHTMLFile() reports an unreachable page by returning false
        // rather than throwing, so the result has to be checked explicitly.
        if ($loaded === false) {
            return 'Sorry, unable to load page: ' . $url;
        }
        $xpathDom = new DOMXPath($dom);

        $grabbedData = [];
        $elements = $xpathDom->query($this->getClassXpath(self::RESTRICTED_CLASS));
        foreach ($elements as $element) {
            foreach ($this->restrictedSectionTitles as $title) {
                if (strpos($element->nodeValue, $title) === false) {
                    continue;
                }

                // The wrapper holding both the heading and the product list
                // is three levels above the heading element.
                $productParent = $element;
                for ($level = 0; $level < 3 && $productParent !== null; $level++) {
                    $productParent = $productParent->parentNode;
                }
                if ($productParent === null) {
                    // Heading sits too close to the document root to have a wrapper.
                    continue;
                }

                // The product data lives in attributes of the first <a> of each item.
                $productItems = $xpathDom->query(self::PRODUCTLIST_XPATH, $productParent);
                foreach ($productItems as $item) {
                    $grabbedData[] = [
                        self::XML_PRICE_TAG => $item->getAttribute('data-product-price'),
                        self::XML_LINK_TAG => $item->getAttribute('href'),
                        self::XML_TITLE_TAG => $item->getAttribute('title'),
                    ];
                }
            }
        }

        // Release the parsed page before building the output document.
        unset($dom, $xpathDom);

        return $this->createXMLFromDict($grabbedData);
    }
}
// Scrape the shop's front page and write the resulting XML (or the error
// message returned on failure) to the output.
$spiderInstance = new Spider();
echo $spiderInstance->createXMLFromUrl('/');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment