-
-
Save nilesolutions/070d94d19a6e02143c52c42d02d046b3 to your computer and use it in GitHub Desktop.
Парсер сайта на примере poverka-ndt.ru
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Emit an opening <pre> tag before any other output; this script never closes it.
echo '<pre>';
/**
 * DOMXPath that downloads a page with cURL, parses it as HTML and offers a
 * few helpers for turning nodes back into markup.
 */
class parserXPath extends DOMXPath {
    /**
     * Download the body of a page.
     *
     * @param string $url address to fetch
     * @return string|false response body, or false when the cURL handle could
     *                      not be created or the transfer failed
     */
    private function getPage($url){
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // Many more options can be added here, see http://php.net/curl_setopt
        $data = curl_exec($ch);
        curl_close($ch);
        return $data;
    }

    /**
     * Fetch $url and build an XPath evaluator over the parsed HTML.
     *
     * When the download fails or returns an empty body the evaluator is bound
     * to an empty document, so every query simply yields no nodes.
     *
     * @param string   $url            page to download
     * @param int|bool $encode_to_utf8 truthy when the body is UTF-8 and must be
     *                                 protected from libxml's default charset guess
     */
    public function __construct($url,$encode_to_utf8 = 0){
        $dom = new DOMDocument();
        $page = $this->getPage($url);
        if (is_string($page) && $page !== '') {
            if ($encode_to_utf8) {
                // Same effect as the deprecated mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8'):
                // every non-ASCII character becomes an entity the HTML parser decodes back.
                $page = mb_encode_numericentity($page, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
            }
            // Collect markup warnings internally instead of printing them.
            $previous = libxml_use_internal_errors(true);
            $dom->loadHTML($page);
            if (!$previous) {
                libxml_clear_errors();
            }
            libxml_use_internal_errors($previous);
        }
        // A constructor has no return value; the original "return $xp" was a no-op.
        parent::__construct($dom);
    }

    /**
     * Serialise a node together with its own tag and all descendants.
     *
     * Note: despite the name this is the node's outer HTML.
     *
     * @param DOMNode|null $node node to serialise
     * @return string markup, or '' when no node was given
     */
    public function innerHTML($node) {
        if (!$node instanceof DOMNode) {
            return '';
        }
        $doc = new DOMDocument();
        $doc->appendChild($doc->importNode($node,true));
        return $doc->saveHTML();
    }

    /**
     * Rebuild simplified markup for the children of a node.
     *
     * Only the attributes style, class, href, src, alt and title are kept.
     * Comments and other non-element, non-text nodes are dropped. Runs of
     * whitespace in text are collapsed to one space and whitespace-only text
     * is skipped. Attribute values and text are HTML-escaped.
     *
     * @param DOMNode|null $node    parent whose children are rendered
     * @param array        $exclude tag names to skip together with their subtree
     * @return string child fragments joined by "\n"; '' when there is nothing to render
     */
    public function childHTML($node,$exclude = array()) {
        if (!$node instanceof DOMNode || !$node->hasChildNodes()) {
            return '';
        }
        $node->normalize();
        $saved_attr = array('style','class','href','src','alt','title');
        // Text inside <script>/<style> is raw text and must not be entity-escaped.
        $rawText = in_array(strtolower($node->nodeName), array('script','style'), true);
        $html = array();
        foreach ($node->childNodes as $n) {
            if ($n->nodeType === XML_ELEMENT_NODE) {
                if (in_array($n->tagName, $exclude, true)) {
                    continue;
                }
                $attr = '';
                foreach ($saved_attr as $a) {
                    if ($n->hasAttribute($a)) {
                        $attr .= ' '.$a.'="'.htmlspecialchars($n->getAttribute($a), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8').'"';
                    }
                }
                if ($n->hasChildNodes()) {
                    $html[] = "<".$n->tagName.$attr.">".$this->childHTML($n,$exclude)."</".$n->tagName.">";
                } else {
                    $html[] = '<'.$n->tagName.$attr.' />';
                }
            } elseif ($n->nodeType === XML_TEXT_NODE || $n->nodeType === XML_CDATA_SECTION_NODE) {
                // The /u flag keeps \s from matching bytes inside multi-byte characters.
                $text = trim((string) preg_replace('/\s+/u', ' ', (string) $n->nodeValue));
                if ($text === '') {
                    continue;
                }
                $html[] = $rawText ? $text : htmlspecialchars($text, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8');
            }
            // Any other node type (comment, processing instruction, ...) is skipped.
        }
        return implode("\n",$html);
    }

    /**
     * Find the $num-th element named $tagName among the direct children of
     * $node; when there is none, search each child element the same way, going
     * at most $depth levels further down.
     *
     * @param DOMNode|null $node    where to start
     * @param string       $tagName element name to look for
     * @param int          $num     zero-based index among the matching siblings
     * @param int          $depth   how many levels below the direct children may be searched
     * @return DOMElement|null first match, or null when nothing was found
     */
    public function childNode($node,$tagName,$num = 0,$depth = 9) {
        if (!$node instanceof DOMNode || !$node->hasChildNodes()) {
            return null;
        }
        $i = 0;
        foreach ($node->childNodes as $n) {
            if ($n->nodeType !== XML_ELEMENT_NODE || $n->tagName != $tagName) {
                continue;
            }
            if ($num == $i) {
                return $n;
            }
            $i++;
        }
        if ($depth > 0) {
            // Descend into the children with a smaller budget. The original
            // re-ran the search on the same node with an unchanged depth and
            // therefore never terminated on a miss.
            foreach ($node->childNodes as $n) {
                if ($n->nodeType !== XML_ELEMENT_NODE) {
                    continue;
                }
                $found = $this->childNode($n,$tagName,$num,$depth - 1);
                if ($found !== null) {
                    return $found;
                }
            }
        }
        return null;
    }

    // Disabled query() override, kept for reference:
    // public function query($q,array $options = array()){
    //     $nodes = parent::query($q);
    //     if($nodes->length <= 1) {
    //         // $nodes = $nodes->item(0);
    //         if($options)
    //             return $this->modify($nodes,$options);
    //     }
    //     return $nodes;
    // }

    /**
     * Render a node according to $options['mode'].
     *
     * @param DOMNode|null $node    node to render
     * @param array        $options 'mode' => 'html' (childHTML) or 'ihtml' (innerHTML)
     * @return string|null rendered markup, or null for a missing or unknown mode
     */
    private function modify($node,array $options = array()){
        $mode = isset($options['mode']) ? $options['mode'] : null;
        if ($mode === 'html') {
            return $this->childHTML($node);
        }
        if ($mode === 'ihtml') {
            return $this->innerHTML($node);
        }
        return null;
    }
}
// Import the menu of the source site into MODX resources: every top-level
// entry of the #Mod125 list becomes a resource under parent 85, and every
// entry of its nested <ul> becomes a child of that resource.
$url = 'http://poverka-ndt.ru/';
// "@" keeps HTML parser warnings raised while loading the page out of the output.
$xp = @new parserXPath($url);

/*
 * Create or update the resource for one <li> menu entry.
 *
 * $item     - the <li> node of the menu
 * $parentId - id of the resource the entry is stored under
 * Returns the saved modResource, or null when the entry has no link.
 */
$importMenuItem = function ($item, $parentId) use ($modx, $xp, $url) {
    // Relative query: the <a> that is a direct child of this <li>.
    $link = $xp->query('a', $item)->item(0);
    if (!$link) {
        return null;
    }
    $tmp = [
        'pagetitle' => trim($link->textContent)
        ,'uri' => $link->getAttribute('href')
        ,'uri_override' => 1
        ,'published' => 1
        ,'createdon' => time()
        ,'template' => 2
        ,'parent' => $parentId
    ];
    $xp2 = @new parserXPath($url . ltrim($tmp['uri'],'/'));
    $description = $xp2->query('//*[@id="t3-content"]/div[2]/div[@class="category_description"]')->item(0);
    // Pages without a description block get empty content instead of a crash.
    $tmp['content'] = $description ? @$xp2->childHTML($description) : '';
    if(!$ob = $modx->getObject('modResource',['pagetitle'=>$tmp['pagetitle']])){
        $ob = $modx->newObject('modResource');
    }
    $ob->fromArray($tmp);
    $ob->save();
    print $ob->id .'<br>';
    return $ob;
};

foreach($xp->query('//*[@id="Mod125"]/div/div/div/ul/li') as $item){
    $category = $importMenuItem($item, 85);
    if(!$category){
        continue;
    }
    // Keep the category id in its own variable: the original reused $ob inside
    // the inner loop, so the second and later sub-items were attached to the
    // previous sub-item instead of the category.
    $categoryId = $category->id;
    foreach($xp->query('ul/li', $item) as $it){
        $importMenuItem($it, $categoryId);
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Emit an opening <pre> tag before any other output; this script never closes it.
echo '<pre>';
/**
 * DOMXPath that downloads a page with cURL, parses it as HTML and offers a
 * few helpers for turning nodes back into markup.
 */
class parserXPath extends DOMXPath {
    /**
     * Download the body of a page.
     *
     * @param string $url address to fetch
     * @return string|false response body, or false when the cURL handle could
     *                      not be created or the transfer failed
     */
    private function getPage($url){
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // Many more options can be added here, see http://php.net/curl_setopt
        $data = curl_exec($ch);
        curl_close($ch);
        return $data;
    }

    /**
     * Fetch $url and build an XPath evaluator over the parsed HTML.
     *
     * When the download fails or returns an empty body the evaluator is bound
     * to an empty document, so every query simply yields no nodes.
     *
     * @param string   $url            page to download
     * @param int|bool $encode_to_utf8 truthy when the body is UTF-8 and must be
     *                                 protected from libxml's default charset guess
     */
    public function __construct($url,$encode_to_utf8 = 0){
        $dom = new DOMDocument();
        $page = $this->getPage($url);
        if (is_string($page) && $page !== '') {
            if ($encode_to_utf8) {
                // Same effect as the deprecated mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8'):
                // every non-ASCII character becomes an entity the HTML parser decodes back.
                $page = mb_encode_numericentity($page, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
            }
            // Collect markup warnings internally instead of printing them.
            $previous = libxml_use_internal_errors(true);
            $dom->loadHTML($page);
            if (!$previous) {
                libxml_clear_errors();
            }
            libxml_use_internal_errors($previous);
        }
        // A constructor has no return value; the original "return $xp" was a no-op.
        parent::__construct($dom);
    }

    /**
     * Serialise a node together with its own tag and all descendants.
     *
     * Note: despite the name this is the node's outer HTML.
     *
     * @param DOMNode|null $node node to serialise
     * @return string markup, or '' when no node was given
     */
    public function innerHTML($node) {
        if (!$node instanceof DOMNode) {
            return '';
        }
        $doc = new DOMDocument();
        $doc->appendChild($doc->importNode($node,true));
        return $doc->saveHTML();
    }

    /**
     * Rebuild simplified markup for the children of a node.
     *
     * Only the attributes style, class, href, src, alt and title are kept.
     * Comments and other non-element, non-text nodes are dropped. Runs of
     * whitespace in text are collapsed to one space and whitespace-only text
     * is skipped. Attribute values and text are HTML-escaped.
     *
     * @param DOMNode|null $node    parent whose children are rendered
     * @param array        $exclude tag names to skip together with their subtree,
     *                              e.g. array('script','style')
     * @return string child fragments joined by "\n"; '' when there is nothing to render
     */
    public function childHTML($node,$exclude = array()) {
        // $exclude = array('script','style');
        if (!$node instanceof DOMNode || !$node->hasChildNodes()) {
            return '';
        }
        $node->normalize();
        $saved_attr = array('style','class','href','src','alt','title');
        // Text inside <script>/<style> is raw text and must not be entity-escaped.
        $rawText = in_array(strtolower($node->nodeName), array('script','style'), true);
        $html = array();
        foreach ($node->childNodes as $n) {
            if ($n->nodeType === XML_ELEMENT_NODE) {
                if (in_array($n->tagName, $exclude, true)) {
                    continue;
                }
                $attr = '';
                foreach ($saved_attr as $a) {
                    if ($n->hasAttribute($a)) {
                        $attr .= ' '.$a.'="'.htmlspecialchars($n->getAttribute($a), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8').'"';
                    }
                }
                if ($n->hasChildNodes()) {
                    $html[] = "<".$n->tagName.$attr.">".$this->childHTML($n,$exclude)."</".$n->tagName.">";
                } else {
                    $html[] = '<'.$n->tagName.$attr.' />';
                }
            } elseif ($n->nodeType === XML_TEXT_NODE || $n->nodeType === XML_CDATA_SECTION_NODE) {
                // The /u flag keeps \s from matching bytes inside multi-byte characters.
                $text = trim((string) preg_replace('/\s+/u', ' ', (string) $n->nodeValue));
                if ($text === '') {
                    continue;
                }
                $html[] = $rawText ? $text : htmlspecialchars($text, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8');
            }
            // Any other node type (comment, processing instruction, ...) is skipped.
        }
        return implode("\n",$html);
    }

    /**
     * Find the $num-th element named $tagName among the direct children of
     * $node; when there is none, search each child element the same way, going
     * at most $depth levels further down.
     *
     * @param DOMNode|null $node    where to start
     * @param string       $tagName element name to look for
     * @param int          $num     zero-based index among the matching siblings
     * @param int          $depth   how many levels below the direct children may be searched
     * @return DOMElement|null first match, or null when nothing was found
     */
    public function childNode($node,$tagName,$num = 0,$depth = 9) {
        if (!$node instanceof DOMNode || !$node->hasChildNodes()) {
            return null;
        }
        $i = 0;
        foreach ($node->childNodes as $n) {
            if ($n->nodeType !== XML_ELEMENT_NODE || $n->tagName != $tagName) {
                continue;
            }
            if ($num == $i) {
                return $n;
            }
            $i++;
        }
        if ($depth > 0) {
            // Descend into the children with a smaller budget. The original
            // re-ran the search on the same node with an unchanged depth and
            // therefore never terminated on a miss.
            foreach ($node->childNodes as $n) {
                if ($n->nodeType !== XML_ELEMENT_NODE) {
                    continue;
                }
                $found = $this->childNode($n,$tagName,$num,$depth - 1);
                if ($found !== null) {
                    return $found;
                }
            }
        }
        return null;
    }

    // Disabled query() override, kept for reference:
    // public function query($q,array $options = array()){
    //     $nodes = parent::query($q);
    //     if($nodes->length <= 1) {
    //         // $nodes = $nodes->item(0);
    //         if($options)
    //             return $this->modify($nodes,$options);
    //     }
    //     return $nodes;
    // }

    /**
     * Render a node according to $options['mode'].
     *
     * @param DOMNode|null $node    node to render
     * @param array        $options 'mode' => 'html' (childHTML) or 'ihtml' (innerHTML)
     * @return string|null rendered markup, or null for a missing or unknown mode
     */
    private function modify($node,array $options = array()){
        $mode = isset($options['mode']) ? $options['mode'] : null;
        if ($mode === 'html') {
            return $this->childHTML($node);
        }
        if ($mode === 'ihtml') {
            return $this->innerHTML($node);
        }
        return null;
    }
}
/*
 * Documentation for dlFile(), which is defined after file_get_contents_curl() below.
 *
 * string dlFile($fileURL, $newDir, $params = array()) - downloads a file.
 *
 * @param string $fileURL URL of the file to download
 * @param string $newDir  directory the file is downloaded into
 * @param array  $params  options:
 *     string  $params['newName'] - new file name (the current name is kept by default)
 *     string  $params['root']    - absolute path to the site folder, for when DOCUMENT_ROOT does not work (needed for cron)
 *     boolean $params['curl']    - if false, cURL is not used
 * @return string $newPath - path to the downloaded image, relative to the site
 */
/**
 * Download a URL with cURL, retrying on transport errors.
 *
 * The request is attempted once and then up to $retry more times, sleeping
 * $delay microseconds before every retry. Each attempt uses its own handle,
 * which is always closed (the original leaked the handle on the retry path
 * and fell back to the default delay after the first retry).
 *
 * @param string     $url   address to fetch
 * @param string|int $ref   Referer header to send; a falsy value sends none
 * @param int        $retry number of additional attempts after a failure
 * @param int        $delay pause before each retry, in microseconds
 * @return string|false response body, or false when every attempt failed
 */
function file_get_contents_curl($url,$ref=0,$retry=5,$delay=250000) {
    $attempts = max(0, (int) $retry) + 1;
    for ($attempt = 0; $attempt < $attempts; $attempt++) {
        if ($attempt > 0) {
            usleep($delay);
        }
        $ch = curl_init();
        if ($ch === false) {
            continue;
        }
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Opera 10.00');
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_URL, $url);
        if($ref) curl_setopt($ch, CURLOPT_REFERER, $ref);
        $data = curl_exec($ch);
        $err = curl_errno($ch);
        curl_close($ch);
        if (!$err) {
            return $data;
        }
    }
    return false;
}
/**
 * Download a file into a directory below the site root.
 *
 * The content is fetched first and written only when the download succeeded,
 * so a failure no longer leaves an empty file behind.
 *
 * @param string $fileURL URL (or path) of the file to download
 * @param string $newDir  target directory relative to the root; a trailing
 *                        slash is added when missing
 * @param array  $params  optional settings:
 *                        'newName' => file name to store under (default: last
 *                                     path segment of $fileURL, without query string),
 *                        'root'    => absolute site root (default: $_SERVER['DOCUMENT_ROOT']),
 *                        'curl'    => pass false to use file_get_contents() instead of cURL
 * @return string path of the stored file relative to the root, or the literal
 *                string "Error downloading" on any failure (kept for
 *                backward compatibility - callers must compare against it)
 */
function dlFile($fileURL,$newDir,$params=array()){
    if (!empty($params['root'])) {
        $root = $params['root'];
    } else {
        $root = isset($_SERVER['DOCUMENT_ROOT']) ? $_SERVER['DOCUMENT_ROOT'] : '';
    }

    $newName = !empty($params['newName']) ? $params['newName'] : '';
    if ($newName === '') {
        // Prefer the URL path so a query string does not end up in the file name.
        $path = parse_url($fileURL, PHP_URL_PATH);
        $newName = is_string($path) ? basename($path) : '';
        if ($newName === '') {
            $segments = explode('/', $fileURL);
            $newName = $segments[count($segments) - 1];
        }
    }

    // Directory and file name are concatenated below, so the separator must be there.
    if ($newDir !== '' && substr($newDir, -1) !== '/') {
        $newDir .= '/';
    }

    $useCurl = !(array_key_exists('curl', $params) && $params['curl'] === false);
    $content = $useCurl ? file_get_contents_curl($fileURL) : file_get_contents($fileURL);
    if ($content === false || $content === '') {
        return "Error downloading";
    }

    $targetDir = "{$root}/{$newDir}";
    if (!is_dir($targetDir) && !@mkdir($targetDir, 0755, true) && !is_dir($targetDir)) {
        return "Error downloading";
    }
    if (file_put_contents($targetDir.$newName, $content) === false) {
        return "Error downloading";
    }
    return "{$newDir}{$newName}";
}
// Import products: for every category resource (template 2 with a frozen URI)
// whose parent is also a template-2 resource, read the product listing of the
// matching page on the source site and create a template-3 resource, its
// manufacturer resource and its gallery for each product not imported yet.
$base_url = 'http://poverka-ndt.ru/';

// Text of the first node matched by $query (optionally relative to $context),
// or '' when nothing matches - avoids dereferencing a missing node.
$firstText = function ($xpath, $query, $context = null) {
    $nodes = $context ? $xpath->query($query, $context) : $xpath->query($query);
    $node = $nodes ? $nodes->item(0) : null;
    return $node ? $node->textContent : '';
};

foreach($modx->getIterator('modResource',['template'=>2,'uri_override'=>1]) as $object){
    // Resources whose parent does not exist are skipped instead of crashing.
    $parentResource = $modx->getObject('modResource',$object->parent);
    if(!$parentResource || $parentResource->template != 2){
        continue;
    }
    // "@" keeps HTML parser warnings raised while loading the page out of the output.
    $xp = @new parserXPath($base_url . ltrim($object->uri,'/') . '?limit=9999');
    foreach($xp->query('//*[@id="t3-content"]/div[2]/div[2]/div[2]/div') as $item){
        $url = $firstText($xp, './/h2/a/@href', $item);
        if($url === ''){
            continue; // listing block without a product link
        }
        if($modx->getObject('modResource', ['uri'=>$url])){
            continue; // already imported
        }
        $ob = $modx->newObject('modResource');
        $tmp = [
            'pagetitle' => $firstText($xp, './/h2/a', $item)
            ,'uri' => $url
            ,'uri_override' => 1
            ,'published' => 1
            ,'createdon' => time()
            ,'template' => 3
            ,'parent' => $object->id
        ];
        // Strip the leading slash explicitly so the URL has no "//" after the host.
        $xp2 = @new parserXPath($base_url . ltrim($url,'/'));
        $tmp['introtext'] = trim($firstText($xp2, '//*[@id="t3-content"]/div[2]/div[1]/div[2]/div[1]'));
        $tmp['article'] = trim(str_replace('Артикул: ','',$firstText($xp2, '//*[@id="t3-content"]/div[2]/div[1]/div[2]/div[2]/div[1]')));
        // Keep the digits only; a missing price becomes 0.
        $tmp['price'] = intval(preg_replace('/\D/','',$firstText($xp2, '//*[@class="PricesalesPrice"]')));
        $home = $xp2->query('//*[@id="home"]')->item(0);
        $tmp['content'] = $home ? @$xp2->innerHTML($home) : '';
        $tmp['description'] = $firstText($xp2, '//meta[@name="description"]/@content');
        $ob->fromArray($tmp);
        $ob->save();

        $tvs = [];
        $manufacturer = trim(str_replace('Производитель: ','',$firstText($xp2, '//*[@id="t3-content"]/div[2]/div[1]/div[2]/div[2]/div[2]')));
        if($manufacturer !== ''){
            // Reuse the manufacturer resource when one with this title exists.
            if(!$o = $modx->getObject('modResource', ['pagetitle'=>$manufacturer])){
                $o = $modx->newObject('modResource');
                $o->fromArray([
                    'pagetitle' => $manufacturer
                    ,'published' => 1
                    ,'createdon' => time()
                    ,'template' => 1
                    ,'parent' => 194
                ]);
                $o->save();
            }
            $tvs['manufacturer'] = $o->id;
        }
        $tvs['keywords'] = $firstText($xp2, '//meta[@name="keywords"]/@content');

        // Start from an empty list so a product without images stores "[]".
        $gallery = [];
        foreach($xp2->query('//*[@id="t3-content"]/div[2]/div[1]/div[1]/div[2]/div[2]/div/div/a') as $img){
            $stored = @dlFile($base_url . ltrim($img->getAttribute('href'),'/'), 'assets/images/products/ank/'.$ob->id.'/');
            if($stored === 'Error downloading'){
                continue; // dlFile() reports failure with this literal string
            }
            $gallery[] = [
                'MIGX_id' => count($gallery)
                ,'image' => str_replace('assets/images/','',$stored)
            ];
        }
        $tvs['gallery'] = json_encode($gallery);

        foreach($tvs as $k=>$tv){
            $ob->setTVValue($k,$tv);
        }
        print $ob->id . '<br>';
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Rewrite product titles (template 3) by applying the regex replacements
// below, in order, to each resource's pagetitle; the resource is then saved
// and its title and id are printed.
//
// Rules that are currently disabled (pattern => replacement):
//   '/^толщиномера/' => 'Толщиномер'
//   '/^дефектоскопа/' => 'Дефектоскоп'
//   '/^денситометра/' => 'Денситометр'
//   '/^дальномера/' => 'Дальномер'
//   '/^измерителя/' => 'Измеритель'
//   '/^ферритометра/' => 'Ферритометр'
//   '/^твердомера/' => 'Твердомер'
//   '/^ультразвукового/' => 'Ультразвуковой'
//   '/^стандартного образца/' => 'Стандартный образец'
//   '/^толщиномеров/' => 'Толщиномер'
//   '/^магнитного толщиномера/' => 'Магнитный толщиномер'
$titleRules = [
    '/пьезоэлектрического/' => 'пьезоэлектрический',
    '/преобразователя/' => 'преобразователь',
    '/пьезоэлектрических преобразователей/' => 'пьезоэлектрический преобразователь',
    '/эталонов/' => 'эталон',
    '/^молотка/' => 'Молоток',
    '/^образцов/' => 'Образец',
    '/^профилометра/' => 'Профилометр',
    '/^хордовых/' => 'Хордовый',
];

// To process a single resource, use ['id'=>216] as the criteria instead.
$resources = $modx->getIterator('modResource', ['template' => 3]);
foreach ($resources as $resource) {
    foreach ($titleRules as $pattern => $replacement) {
        $current = $resource->get('pagetitle');
        $resource->set('pagetitle', preg_replace($pattern, $replacement, $current));
    }
    $resource->save();
    print_r($resource->get('pagetitle'));
    print_r($resource->get('id'));
    echo '<br>';
}
echo 'OK';
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment