nilesolutions · December 23, 2017 11:45
diff --git a/pars_category.php b/pars_category.php
 <?php
 // Emits an opening <pre> tag ahead of everything else this script prints;
 // the visible code never prints a matching closing tag.
 print '<pre>';
 /**
  * DOMXPath that downloads a page with cURL, parses it as HTML and offers
  * helpers for turning matched nodes back into HTML strings.
  */
 class parserXPath extends DOMXPath {

     /**
      * Downloads $url with cURL.
      *
      * @param string $url address to fetch
      * @return string|false response body, or false when the transfer failed
      */
     private function getPage($url){
         $ch = curl_init();
         if ($ch === false) {
             return false;
         }
         curl_setopt($ch, CURLOPT_URL, $url);
         curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0");
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
         // plenty of other options can be added here, see
         // http://php.net/curl_setopt
         $data = curl_exec($ch);

         curl_close($ch);
         return $data;
     }

     /**
      * Fetches $url and binds this XPath object to the parsed document.
      *
      * A failed or empty download yields an empty document, so queries simply
      * return empty node lists instead of the constructor blowing up.
      *
      * @param string   $url            page to download
      * @param int|bool $encode_to_utf8 when truthy, the body is run through
      *                                 mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8')
      *                                 before parsing
      */
     public function __construct($url,$encode_to_utf8 = 0){
         $html = $this->getPage($url);
         if (!is_string($html)) {
             $html = '';
         }
         if ($encode_to_utf8 && $html !== '') {
             // NOTE(review): the 'HTML-ENTITIES' target is deprecated in recent PHP releases.
             $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
         }

         $dom = new DOMDocument();
         if ($html !== '') {
             // Collect markup warnings internally instead of printing them.
             $previous = libxml_use_internal_errors(true);
             $dom->loadHTML($html);
             libxml_clear_errors();
             libxml_use_internal_errors($previous);
         }
         // A constructor has no meaningful return value, so nothing is returned.
         parent::__construct($dom);
     }

     /**
      * Serialises $node, including the node's own tag, as HTML.
      *
      * @param DOMNode|null $node node to serialise
      * @return string HTML as produced by DOMDocument::saveHTML(), '' when $node is not a node
      */
     public function innerHTML($node) {
         if (!$node instanceof DOMNode) {
             return '';
         }
         $doc = new DOMDocument();
         $doc->appendChild($doc->importNode($node,true));
         return $doc->saveHTML();
     }

     /**
      * Rebuilds the children of $node as HTML, keeping only the attributes
      * style, class, href, src, alt and title.
      *
      * Elements are rendered recursively (childless ones as "<tag />"), text
      * has newlines and double spaces removed, comments and other node types
      * are dropped. The pieces are joined with "\n".
      *
      * @param DOMNode|null $node    parent whose children are rendered
      * @param array        $exclude tag names to skip together with their subtree
      * @return string rebuilt markup, '' when $node is not a node
      */
     public function childHTML($node,$exclude = array()) {
         if (!$node instanceof DOMNode) {
             return '';
         }
         $html = array();
         $node->normalize();
         foreach ($node->childNodes as $n) {
             if ($n->nodeType === XML_ELEMENT_NODE) {
                 if (in_array($n->tagName, $exclude, true)) {
                     continue;
                 }
                 $attr = '';
                 foreach (array('style','class','href','src','alt','title') as $a) {
                     if ($n->hasAttribute($a)) {
                         // getAttribute() returns the decoded value, so it is escaped again.
                         $attr .= ' '.$a.'="'.htmlspecialchars($n->getAttribute($a), ENT_QUOTES, 'UTF-8').'"';
                     }
                 }
                 if ($n->hasChildNodes()) {
                     $html[] = "<".$n->tagName.$attr.">".$this->childHTML($n,$exclude)."</".$n->tagName.">";
                 } else {
                     $html[] = '<'.$n->tagName.$attr.' />';
                 }
             } elseif ($n->nodeType === XML_TEXT_NODE || $n->nodeType === XML_CDATA_SECTION_NODE) {
                 $html[] = str_replace(array("\n","  "),'',$n->nodeValue);
             }
             // Comments, processing instructions etc. have neither a tag nor attributes: skipped.
         }
         return implode("\n",$html);
     }

     /**
      * Finds the $num-th (zero-based) child element of $node named $tagName.
      *
      * When $node has no such direct child, its child elements are searched
      * in document order, at most $depth levels down; $num then counts the
      * matching siblings on the level where the match is found.
      *
      * @param DOMNode|null $node    where to start
      * @param string       $tagName element name to look for
      * @param int          $num     index among the matching siblings
      * @param int          $depth   how many levels below $node may be searched
      * @return DOMNode|null the element, or null when nothing matches
      */
     public function childNode($node,$tagName,$num = 0,$depth = 9) {
         if (!$node instanceof DOMNode) {
             return null;
         }
         $i = 0;
         foreach ($node->childNodes as $n) {
             if ($n->nodeType !== XML_ELEMENT_NODE || $n->tagName !== $tagName) {
                 continue;
             }
             if ($num == $i) {
                 return $n;
             }
             $i++;
         }
         if ($depth > 0) {
             foreach ($node->childNodes as $n) {
                 if ($n->nodeType !== XML_ELEMENT_NODE) {
                     continue;
                 }
                 // Descend with a smaller budget so the search always terminates.
                 $found = $this->childNode($n,$tagName,$num,$depth - 1);
                 if ($found) {
                     return $found;
                 }
             }
         }
         return null;
     }

     // Disabled override of query() that would post-process single results:
     //   public function query($q,array $options = array()){
     //       $nodes = parent::query($q);
     //       if($nodes->length <= 1) {
     //           // $nodes = $nodes->item(0);
     //           if($options)
     //             return $this->modify($nodes,$options);
     //       }
     //       return $nodes;
     //   }

     /**
      * Renders $node according to $options['mode'].
      *
      * @param DOMNode|null $node    node to render
      * @param array        $options 'mode' => 'html' (childHTML) or 'ihtml' (innerHTML)
      * @return string|null rendered markup, null for a missing or unknown mode
      */
     private function modify($node,array $options = array()){
         $mode = isset($options['mode']) ? $options['mode'] : null;
         switch ($mode) {
             case 'html':
                 return $this->childHTML($node);
             case 'ihtml':
                 return $this->innerHTML($node);
             default:
                 return null;
         }
     }
 }

 // Imports the category menu of the source site into MODX resources: every
 // top-level menu entry is stored under parent 85, every nested entry under
 // the resource created for its top-level entry. Existing resources are
 // matched by pagetitle and overwritten.
 // NOTE(review): $modx is not defined in this file — presumably the script
 // runs inside a MODX snippet/console; confirm.
 $url = 'http://poverka-ndt.ru/';
 // "@" keeps the warnings raised while parsing real-world markup out of the output.
 $xp = @new parserXPath($url);

 // Collects the resource fields for one menu <li>; null when the item has no link.
 $readCategory = function ($li, $parentId) use ($xp, $url) {
     $link = $xp->query('a', $li)->item(0);
     if (!$link) {
         return null;
     }
     $fields = [
         'pagetitle' => trim($link->textContent),
         'uri' => $link->getAttribute('href'),
         'uri_override' => 1,
         'published' => 1,
         'createdon' => time(),
         'template' => 2,
         'parent' => $parentId,
     ];

     $page = @new parserXPath($url . ltrim($fields['uri'], '/'));
     $description = $page->query('//*[@id="t3-content"]/div[2]/div[@class="category_description"]')->item(0);
     // Pages without a description block get empty content instead of a crash.
     $fields['content'] = $description ? @$page->childHTML($description) : '';

     return $fields;
 };

 // Creates or updates the resource with the given pagetitle and prints its id.
 $storeCategory = function (array $fields) use ($modx) {
     $resource = $modx->getObject('modResource', ['pagetitle' => $fields['pagetitle']]);
     if (!$resource) {
         $resource = $modx->newObject('modResource');
     }
     $resource->fromArray($fields);
     $resource->save();

     print $resource->id .'<br>';

     return $resource;
 };

 foreach ($xp->query('//*[@id="Mod125"]/div/div/div/ul/li') as $item) {
     $fields = $readCategory($item, 85);
     if ($fields === null) {
         continue;
     }
     // Remember the id now: every sub-category must hang off this resource,
     // not off whichever sibling was saved last.
     $categoryId = $storeCategory($fields)->id;

     foreach ($xp->query('ul/li', $item) as $it) {
         $childFields = $readCategory($it, $categoryId);
         if ($childFields !== null) {
             $storeCategory($childFields);
         }
     }
 }
diff --git a/pars_products.php b/pars_products.php
 <?php
 // Emits an opening <pre> tag ahead of everything else this script prints;
 // the visible code never prints a matching closing tag.
 print '<pre>';
 /**
  * DOMXPath that downloads a page with cURL, parses it as HTML and offers
  * helpers for turning matched nodes back into HTML strings.
  */
 class parserXPath extends DOMXPath {

     /**
      * Downloads $url with cURL.
      *
      * @param string $url address to fetch
      * @return string|false response body, or false when the transfer failed
      */
     private function getPage($url){
         $ch = curl_init();
         if ($ch === false) {
             return false;
         }
         curl_setopt($ch, CURLOPT_URL, $url);
         curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0");
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
         // plenty of other options can be added here, see
         // http://php.net/curl_setopt
         $data = curl_exec($ch);

         curl_close($ch);
         return $data;
     }

     /**
      * Fetches $url and binds this XPath object to the parsed document.
      *
      * A failed or empty download yields an empty document, so queries simply
      * return empty node lists instead of the constructor blowing up.
      *
      * @param string   $url            page to download
      * @param int|bool $encode_to_utf8 when truthy, the body is run through
      *                                 mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8')
      *                                 before parsing
      */
     public function __construct($url,$encode_to_utf8 = 0){
         $html = $this->getPage($url);
         if (!is_string($html)) {
             $html = '';
         }
         if ($encode_to_utf8 && $html !== '') {
             // NOTE(review): the 'HTML-ENTITIES' target is deprecated in recent PHP releases.
             $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
         }

         $dom = new DOMDocument();
         if ($html !== '') {
             // Collect markup warnings internally instead of printing them.
             $previous = libxml_use_internal_errors(true);
             $dom->loadHTML($html);
             libxml_clear_errors();
             libxml_use_internal_errors($previous);
         }
         // A constructor has no meaningful return value, so nothing is returned.
         parent::__construct($dom);
     }

     /**
      * Serialises $node, including the node's own tag, as HTML.
      *
      * @param DOMNode|null $node node to serialise
      * @return string HTML as produced by DOMDocument::saveHTML(), '' when $node is not a node
      */
     public function innerHTML($node) {
         if (!$node instanceof DOMNode) {
             return '';
         }
         $doc = new DOMDocument();
         $doc->appendChild($doc->importNode($node,true));
         return $doc->saveHTML();
     }

     /**
      * Rebuilds the children of $node as HTML, keeping only the attributes
      * style, class, href, src, alt and title.
      *
      * Elements are rendered recursively (childless ones as "<tag />"), text
      * has newlines and double spaces removed, comments and other node types
      * are dropped. The pieces are joined with "\n".
      *
      * @param DOMNode|null $node    parent whose children are rendered
      * @param array        $exclude tag names to skip together with their
      *                              subtree, e.g. array('script','style')
      * @return string rebuilt markup, '' when $node is not a node
      */
     public function childHTML($node,$exclude = array()) {
         if (!$node instanceof DOMNode) {
             return '';
         }
         $html = array();
         $node->normalize();
         foreach ($node->childNodes as $n) {
             if ($n->nodeType === XML_ELEMENT_NODE) {
                 if (in_array($n->tagName, $exclude, true)) {
                     continue;
                 }
                 $attr = '';
                 foreach (array('style','class','href','src','alt','title') as $a) {
                     if ($n->hasAttribute($a)) {
                         // getAttribute() returns the decoded value, so it is escaped again.
                         $attr .= ' '.$a.'="'.htmlspecialchars($n->getAttribute($a), ENT_QUOTES, 'UTF-8').'"';
                     }
                 }
                 if ($n->hasChildNodes()) {
                     $html[] = "<".$n->tagName.$attr.">".$this->childHTML($n,$exclude)."</".$n->tagName.">";
                 } else {
                     $html[] = '<'.$n->tagName.$attr.' />';
                 }
             } elseif ($n->nodeType === XML_TEXT_NODE || $n->nodeType === XML_CDATA_SECTION_NODE) {
                 $html[] = str_replace(array("\n","  "),'',$n->nodeValue);
             }
             // Comments, processing instructions etc. have neither a tag nor attributes: skipped.
         }
         return implode("\n",$html);
     }

     /**
      * Finds the $num-th (zero-based) child element of $node named $tagName.
      *
      * When $node has no such direct child, its child elements are searched
      * in document order, at most $depth levels down; $num then counts the
      * matching siblings on the level where the match is found.
      *
      * @param DOMNode|null $node    where to start
      * @param string       $tagName element name to look for
      * @param int          $num     index among the matching siblings
      * @param int          $depth   how many levels below $node may be searched
      * @return DOMNode|null the element, or null when nothing matches
      */
     public function childNode($node,$tagName,$num = 0,$depth = 9) {
         if (!$node instanceof DOMNode) {
             return null;
         }
         $i = 0;
         foreach ($node->childNodes as $n) {
             if ($n->nodeType !== XML_ELEMENT_NODE || $n->tagName !== $tagName) {
                 continue;
             }
             if ($num == $i) {
                 return $n;
             }
             $i++;
         }
         if ($depth > 0) {
             foreach ($node->childNodes as $n) {
                 if ($n->nodeType !== XML_ELEMENT_NODE) {
                     continue;
                 }
                 // Descend with a smaller budget so the search always terminates.
                 $found = $this->childNode($n,$tagName,$num,$depth - 1);
                 if ($found) {
                     return $found;
                 }
             }
         }
         return null;
     }

     // Disabled override of query() that would post-process single results:
     //   public function query($q,array $options = array()){
     //       $nodes = parent::query($q);
     //       if($nodes->length <= 1) {
     //           // $nodes = $nodes->item(0);
     //           if($options)
     //             return $this->modify($nodes,$options);
     //       }
     //       return $nodes;
     //   }

     /**
      * Renders $node according to $options['mode'].
      *
      * @param DOMNode|null $node    node to render
      * @param array        $options 'mode' => 'html' (childHTML) or 'ihtml' (innerHTML)
      * @return string|null rendered markup, null for a missing or unknown mode
      */
     private function modify($node,array $options = array()){
         $mode = isset($options['mode']) ? $options['mode'] : null;
         switch ($mode) {
             case 'html':
                 return $this->childHTML($node);
             case 'ihtml':
                 return $this->innerHTML($node);
             default:
                 return null;
         }
     }
 }
 /** string dlFile($fileURL, $newDir, $params = array()) - downloads a file
  *  (documents dlFile(), which is defined further down, after file_get_contents_curl())
  * @param string $fileURL - URL of the file to download
  * @param string $newDir - directory the file should be downloaded into
  * @param array $params - array of options
  *     string $params['newName'] - new file name (by default the current name is kept)
  *     string $params['root'] - absolute path to the site folder, for when DOCUMENT_ROOT does not work (needed for cron)
  *     boolean $params['curl'] - if false, cURL is not used
  * return $newPath - path to the downloaded image, relative to the site
 **/
 /**
  * Fetches $url with cURL, retrying when the transfer itself fails.
  *
  * Only cURL errors (curl_errno() != 0) trigger a retry; whatever body the
  * server sends for a completed transfer is returned as is.
  *
  * @param string     $url   address to fetch
  * @param string|int $ref   Referer header to send, falsy for none
  * @param int        $retry how many additional attempts are allowed after the first
  * @param int        $delay pause between attempts, in microseconds
  * @return string|false response body, or false when every attempt failed
  */
 function file_get_contents_curl($url,$ref=0,$retry=5,$delay=250000) {
     for ($attempt = 0; ; $attempt++) {
         $ch = curl_init();
         if ($ch === false) {
             return false;
         }
         curl_setopt($ch, CURLOPT_HEADER, 0);
         curl_setopt($ch, CURLOPT_USERAGENT, 'Opera 10.00');
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
         curl_setopt($ch, CURLOPT_URL, $url);
         if ($ref) {
             curl_setopt($ch, CURLOPT_REFERER, $ref);
         }
         $data = curl_exec($ch);
         $err = curl_errno($ch);
         // Close the handle on every attempt, including the ones that get retried.
         curl_close($ch);

         if (!$err) {
             return $data;
         }
         if ($attempt >= $retry) {
             return false;
         }
         // The caller's delay applies to every retry, not only the first one.
         usleep($delay);
     }
 }
 /**
  * Downloads a file into a directory below the site root.
  *
  * The download happens before anything is written, so a failed download
  * no longer leaves an empty file behind.
  *
  * @param string $fileURL URL (or path) of the file to download
  * @param string $newDir  target directory relative to the root; a missing
  *                        trailing slash is added
  * @param array  $params  optional settings:
  *                        'newName' => file name to store under (default: last segment of $fileURL),
  *                        'root'    => absolute site path (default: $_SERVER['DOCUMENT_ROOT']),
  *                        'curl'    => pass false to use file_get_contents() instead of cURL
  * @return string "{$newDir}{$newName}" on success, the string "Error downloading" otherwise
  */
 function dlFile($fileURL,$newDir,$params=array()){
     if (!empty($params['root'])) {
         $root = $params['root'];
     } else {
         $root = isset($_SERVER['DOCUMENT_ROOT']) ? $_SERVER['DOCUMENT_ROOT'] : '';
     }

     if (!empty($params['newName'])) {
         $newName = $params['newName'];
     } else {
         $segments = explode('/', $fileURL);
         $newName = end($segments);
     }

     // Directory and file name are concatenated below, so the separator must be there.
     if ($newDir !== '' && substr($newDir, -1) !== '/') {
         $newDir .= '/';
     }
     $targetDir = "{$root}/{$newDir}";

     if (isset($params['curl']) && $params['curl'] === false) {
         $content = file_get_contents($fileURL);
     } else {
         $content = file_get_contents_curl($fileURL);
     }
     if ($content === false || $content === '') {
         return "Error downloading";
     }

     // "@" only hides the warning of a concurrent creation; the is_dir() re-check decides.
     if (!is_dir($targetDir) && !@mkdir($targetDir, 0755, true) && !is_dir($targetDir)) {
         return "Error downloading";
     }
     if (file_put_contents($targetDir . $newName, $content) === false) {
         return "Error downloading";
     }

     return "{$newDir}{$newName}";
 }
 // Imports the products of the source site: for every second-level category
 // resource (template 2 whose parent is also template 2) the category page
 // is crawled and each product not yet known by its uri is created as a
 // template-3 resource with its TVs (manufacturer, keywords, gallery).
 // NOTE(review): $modx is not defined in this file — presumably the script
 // runs inside a MODX snippet/console; confirm.
 $base_url = 'http://poverka-ndt.ru/';

 // Text of the first node matched by $query, or '' when nothing matches.
 $firstText = function ($xpath, $query) {
     $node = $xpath->query($query)->item(0);
     return $node ? $node->textContent : '';
 };

 foreach ($modx->getIterator('modResource',['template'=>2,'uri_override'=>1]) as $object) {
     $parentResource = $modx->getObject('modResource', $object->parent);
     if (!$parentResource || $parentResource->template != 2) {
         continue;
     }

     // "@" keeps the warnings raised while parsing real-world markup out of the output.
     $xp = @new parserXPath($base_url . ltrim($object->uri,'/') . '?limit=9999');

     foreach ($xp->query('//*[@id="t3-content"]/div[2]/div[2]/div[2]/div') as $item) {
         $link = $xp->query('.//h2/a', $item)->item(0);
         $url = $link ? $link->getAttribute('href') : '';
         // Skip tiles without a link and products that were imported before.
         if ($url === '' || $modx->getObject('modResource', ['uri'=>$url])) {
             continue;
         }

         // Strip the leading slash: $base_url already ends with one.
         $xp2 = @new parserXPath($base_url . ltrim(trim($url), '/'));
         $details = '//*[@id="t3-content"]/div[2]/div[1]/div[2]';
         $contentNode = $xp2->query('//*[@id="home"]')->item(0);

         $ob = $modx->newObject('modResource');
         $ob->fromArray([
             'pagetitle' => trim($link->textContent),
             'uri' => $url,
             'uri_override' => 1,
             'published' => 1,
             'createdon' => time(),
             'template' => 3,
             'parent' => $object->id,
             'introtext' => trim($firstText($xp2, $details . '/div[1]')),
             'article' => trim(str_replace('Артикул: ','',$firstText($xp2, $details . '/div[2]/div[1]'))),
             'price' => intval(preg_replace('/\D/','',$firstText($xp2, '//*[@class="PricesalesPrice"]'))),
             'content' => $contentNode ? $xp2->innerHTML($contentNode) : '',
             'description' => $firstText($xp2, '//meta[@name="description"]/@content'),
         ]);
         // Without a saved resource there is no id for the image folder or the TVs.
         if (!$ob->save()) {
             print 'Could not save ' . $url . '<br>';
             continue;
         }

         $tvs = [];

         $manufacturer = trim(str_replace('Производитель: ','',$firstText($xp2, $details . '/div[2]/div[2]')));
         if ($manufacturer !== '') {
             $o = $modx->getObject('modResource', ['pagetitle'=>$manufacturer]);
             if (!$o) {
                 $o = $modx->newObject('modResource');
                 $o->fromArray([
                     'pagetitle' => $manufacturer,
                     'published' => 1,
                     'createdon' => time(),
                     'template' => 1,
                     'parent' => 194,
                 ]);
                 $o->save();
             }
             $tvs['manufacturer'] = $o->id;
         }

         $tvs['keywords'] = $firstText($xp2, '//meta[@name="keywords"]/@content');

         $gallery = [];
         foreach ($xp2->query('//*[@id="t3-content"]/div[2]/div[1]/div[1]/div[2]/div[2]/div/div/a') as $img) {
             $stored = @dlFile($base_url . trim($img->getAttribute('href'),'/'), 'assets/images/products/ank/'.$ob->id.'/');
             // dlFile() reports failure with this literal string; such images are left out.
             if ($stored === 'Error downloading') {
                 continue;
             }
             $gallery[] = [
                 'MIGX_id' => count($gallery),
                 'image' => str_replace('assets/images/','',$stored),
             ];
         }
         // Only write the gallery TV when there is something to show.
         if ($gallery) {
             $tvs['gallery'] = json_encode($gallery);
         }

         foreach ($tvs as $k=>$tv) {
             $ob->setTVValue($k,$tv);
         }

         print $ob->id . '<br>';
     }
 }
diff --git a/replace_pagetitle.php b/replace_pagetitle.php
 <?php
 // Rewrites word forms in the pagetitle of every template-3 resource.
 // The rules are applied top to bottom, each one on the result of the
 // previous one; every resource is saved and echoed as title + id.
 $rules = [
     // Rules from earlier runs, currently switched off:
     // '/^толщиномера/' => 'Толщиномер',
     // '/^дефектоскопа/' => 'Дефектоскоп',
     // '/^денситометра/' => 'Денситометр',
     // '/^дальномера/' => 'Дальномер',
     // '/^измерителя/' => 'Измеритель',
     // '/^ферритометра/' => 'Ферритометр',
     // '/^твердомера/' => 'Твердомер',
     // '/^ультразвукового/' => 'Ультразвуковой',
     // '/^стандартного образца/' => 'Стандартный образец',
     // '/^толщиномеров/' => 'Толщиномер',
     // '/^магнитного толщиномера/' => 'Магнитный толщиномер',
     '/пьезоэлектрического/' => 'пьезоэлектрический',
     '/преобразователя/' => 'преобразователь',
     '/пьезоэлектрических преобразователей/' => 'пьезоэлектрический преобразователь',
     '/эталонов/' => 'эталон',
     '/^молотка/' => 'Молоток',
     '/^образцов/' => 'Образец',
     '/^профилометра/' => 'Профилометр',
     '/^хордовых/' => 'Хордовый',
 ];

 $resources = $modx->getIterator('modResource', ['template' => 3]);
 // To try the rules on a single resource:
 // $resources = $modx->getIterator('modResource', ['id' => 216]);
 foreach ($resources as $resource) {
     foreach ($rules as $pattern => $replacement) {
         $current = $resource->get('pagetitle');
         $resource->set('pagetitle', preg_replace($pattern, $replacement, $current));
     }
     $resource->save();

     print_r($resource->get('pagetitle'));
     print_r($resource->get('id'));
     print '<br>';
 }
 print 'OK';
	<?php
	// Emits an opening <pre> tag ahead of everything else this script prints;
	// the visible code never prints a matching closing tag.
	print '<pre>';
	/**
	 * DOMXPath that downloads a page with cURL, parses it as HTML and offers
	 * helpers for turning matched nodes back into HTML strings.
	 */
	class parserXPath extends DOMXPath {

	    /**
	     * Downloads $url with cURL.
	     *
	     * @param string $url address to fetch
	     * @return string|false response body, or false when the transfer failed
	     */
	    private function getPage($url){
	        $ch = curl_init();
	        if ($ch === false) {
	            return false;
	        }
	        curl_setopt($ch, CURLOPT_URL, $url);
	        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0");
	        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
	        // plenty of other options can be added here, see
	        // http://php.net/curl_setopt
	        $data = curl_exec($ch);

	        curl_close($ch);
	        return $data;
	    }

	    /**
	     * Fetches $url and binds this XPath object to the parsed document.
	     *
	     * A failed or empty download yields an empty document, so queries simply
	     * return empty node lists instead of the constructor blowing up.
	     *
	     * @param string   $url            page to download
	     * @param int|bool $encode_to_utf8 when truthy, the body is run through
	     *                                 mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8')
	     *                                 before parsing
	     */
	    public function __construct($url,$encode_to_utf8 = 0){
	        $html = $this->getPage($url);
	        if (!is_string($html)) {
	            $html = '';
	        }
	        if ($encode_to_utf8 && $html !== '') {
	            // NOTE(review): the 'HTML-ENTITIES' target is deprecated in recent PHP releases.
	            $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
	        }

	        $dom = new DOMDocument();
	        if ($html !== '') {
	            // Collect markup warnings internally instead of printing them.
	            $previous = libxml_use_internal_errors(true);
	            $dom->loadHTML($html);
	            libxml_clear_errors();
	            libxml_use_internal_errors($previous);
	        }
	        // A constructor has no meaningful return value, so nothing is returned.
	        parent::__construct($dom);
	    }

	    /**
	     * Serialises $node, including the node's own tag, as HTML.
	     *
	     * @param DOMNode|null $node node to serialise
	     * @return string HTML as produced by DOMDocument::saveHTML(), '' when $node is not a node
	     */
	    public function innerHTML($node) {
	        if (!$node instanceof DOMNode) {
	            return '';
	        }
	        $doc = new DOMDocument();
	        $doc->appendChild($doc->importNode($node,true));
	        return $doc->saveHTML();
	    }

	    /**
	     * Rebuilds the children of $node as HTML, keeping only the attributes
	     * style, class, href, src, alt and title.
	     *
	     * Elements are rendered recursively (childless ones as "<tag />"), text
	     * has newlines and double spaces removed, comments and other node types
	     * are dropped. The pieces are joined with "\n".
	     *
	     * @param DOMNode|null $node    parent whose children are rendered
	     * @param array        $exclude tag names to skip together with their subtree
	     * @return string rebuilt markup, '' when $node is not a node
	     */
	    public function childHTML($node,$exclude = array()) {
	        if (!$node instanceof DOMNode) {
	            return '';
	        }
	        $html = array();
	        $node->normalize();
	        foreach ($node->childNodes as $n) {
	            if ($n->nodeType === XML_ELEMENT_NODE) {
	                if (in_array($n->tagName, $exclude, true)) {
	                    continue;
	                }
	                $attr = '';
	                foreach (array('style','class','href','src','alt','title') as $a) {
	                    if ($n->hasAttribute($a)) {
	                        // getAttribute() returns the decoded value, so it is escaped again.
	                        $attr .= ' '.$a.'="'.htmlspecialchars($n->getAttribute($a), ENT_QUOTES, 'UTF-8').'"';
	                    }
	                }
	                if ($n->hasChildNodes()) {
	                    $html[] = "<".$n->tagName.$attr.">".$this->childHTML($n,$exclude)."</".$n->tagName.">";
	                } else {
	                    $html[] = '<'.$n->tagName.$attr.' />';
	                }
	            } elseif ($n->nodeType === XML_TEXT_NODE || $n->nodeType === XML_CDATA_SECTION_NODE) {
	                // Two spaces on purpose: removing single spaces would glue all words together.
	                $html[] = str_replace(array("\n","  "),'',$n->nodeValue);
	            }
	            // Comments, processing instructions etc. have neither a tag nor attributes: skipped.
	        }
	        return implode("\n",$html);
	    }

	    /**
	     * Finds the $num-th (zero-based) child element of $node named $tagName.
	     *
	     * When $node has no such direct child, its child elements are searched
	     * in document order, at most $depth levels down; $num then counts the
	     * matching siblings on the level where the match is found.
	     *
	     * @param DOMNode|null $node    where to start
	     * @param string       $tagName element name to look for
	     * @param int          $num     index among the matching siblings
	     * @param int          $depth   how many levels below $node may be searched
	     * @return DOMNode|null the element, or null when nothing matches
	     */
	    public function childNode($node,$tagName,$num = 0,$depth = 9) {
	        if (!$node instanceof DOMNode) {
	            return null;
	        }
	        $i = 0;
	        foreach ($node->childNodes as $n) {
	            if ($n->nodeType !== XML_ELEMENT_NODE || $n->tagName !== $tagName) {
	                continue;
	            }
	            if ($num == $i) {
	                return $n;
	            }
	            $i++;
	        }
	        if ($depth > 0) {
	            foreach ($node->childNodes as $n) {
	                if ($n->nodeType !== XML_ELEMENT_NODE) {
	                    continue;
	                }
	                // Descend with a smaller budget so the search always terminates.
	                $found = $this->childNode($n,$tagName,$num,$depth - 1);
	                if ($found) {
	                    return $found;
	                }
	            }
	        }
	        return null;
	    }

	    // Disabled override of query() that would post-process single results:
	    //   public function query($q,array $options = array()){
	    //       $nodes = parent::query($q);
	    //       if($nodes->length <= 1) {
	    //           // $nodes = $nodes->item(0);
	    //           if($options)
	    //             return $this->modify($nodes,$options);
	    //       }
	    //       return $nodes;
	    //   }

	    /**
	     * Renders $node according to $options['mode'].
	     *
	     * @param DOMNode|null $node    node to render
	     * @param array        $options 'mode' => 'html' (childHTML) or 'ihtml' (innerHTML)
	     * @return string|null rendered markup, null for a missing or unknown mode
	     */
	    private function modify($node,array $options = array()){
	        $mode = isset($options['mode']) ? $options['mode'] : null;
	        switch ($mode) {
	            case 'html':
	                return $this->childHTML($node);
	            case 'ihtml':
	                return $this->innerHTML($node);
	            default:
	                return null;
	        }
	    }
	}

	// Imports the category menu of the source site into MODX resources: every
	// top-level menu entry is stored under parent 85, every nested entry under
	// the resource created for its top-level entry. Existing resources are
	// matched by pagetitle and overwritten.
	// NOTE(review): $modx is not defined in this file — presumably the script
	// runs inside a MODX snippet/console; confirm.
	$url = 'http://poverka-ndt.ru/';
	// "@" keeps the warnings raised while parsing real-world markup out of the output.
	$xp = @new parserXPath($url);

	// Collects the resource fields for one menu <li>; null when the item has no link.
	$readCategory = function ($li, $parentId) use ($xp, $url) {
	    $link = $xp->query('a', $li)->item(0);
	    if (!$link) {
	        return null;
	    }
	    $fields = [
	        'pagetitle' => trim($link->textContent),
	        'uri' => $link->getAttribute('href'),
	        'uri_override' => 1,
	        'published' => 1,
	        'createdon' => time(),
	        'template' => 2,
	        'parent' => $parentId,
	    ];

	    $page = @new parserXPath($url . ltrim($fields['uri'], '/'));
	    $description = $page->query('//*[@id="t3-content"]/div[2]/div[@class="category_description"]')->item(0);
	    // Pages without a description block get empty content instead of a crash.
	    $fields['content'] = $description ? @$page->childHTML($description) : '';

	    return $fields;
	};

	// Creates or updates the resource with the given pagetitle and prints its id.
	$storeCategory = function (array $fields) use ($modx) {
	    $resource = $modx->getObject('modResource', ['pagetitle' => $fields['pagetitle']]);
	    if (!$resource) {
	        $resource = $modx->newObject('modResource');
	    }
	    $resource->fromArray($fields);
	    $resource->save();

	    print $resource->id .'<br>';

	    return $resource;
	};

	foreach ($xp->query('//*[@id="Mod125"]/div/div/div/ul/li') as $item) {
	    $fields = $readCategory($item, 85);
	    if ($fields === null) {
	        continue;
	    }
	    // Remember the id now: every sub-category must hang off this resource,
	    // not off whichever sibling was saved last.
	    $categoryId = $storeCategory($fields)->id;

	    foreach ($xp->query('ul/li', $item) as $it) {
	        $childFields = $readCategory($it, $categoryId);
	        if ($childFields !== null) {
	            $storeCategory($childFields);
	        }
	    }
	}
	<?php
	// Rewrites word forms in the pagetitle of every template-3 resource.
	// The rules are applied top to bottom, each one on the result of the
	// previous one; every resource is saved and echoed as title + id.
	$rules = [
	    // Rules from earlier runs, currently switched off:
	    // '/^толщиномера/' => 'Толщиномер',
	    // '/^дефектоскопа/' => 'Дефектоскоп',
	    // '/^денситометра/' => 'Денситометр',
	    // '/^дальномера/' => 'Дальномер',
	    // '/^измерителя/' => 'Измеритель',
	    // '/^ферритометра/' => 'Ферритометр',
	    // '/^твердомера/' => 'Твердомер',
	    // '/^ультразвукового/' => 'Ультразвуковой',
	    // '/^стандартного образца/' => 'Стандартный образец',
	    // '/^толщиномеров/' => 'Толщиномер',
	    // '/^магнитного толщиномера/' => 'Магнитный толщиномер',
	    '/пьезоэлектрического/' => 'пьезоэлектрический',
	    '/преобразователя/' => 'преобразователь',
	    '/пьезоэлектрических преобразователей/' => 'пьезоэлектрический преобразователь',
	    '/эталонов/' => 'эталон',
	    '/^молотка/' => 'Молоток',
	    '/^образцов/' => 'Образец',
	    '/^профилометра/' => 'Профилометр',
	    '/^хордовых/' => 'Хордовый',
	];

	$resources = $modx->getIterator('modResource', ['template' => 3]);
	// To try the rules on a single resource:
	// $resources = $modx->getIterator('modResource', ['id' => 216]);
	foreach ($resources as $resource) {
	    foreach ($rules as $pattern => $replacement) {
	        $current = $resource->get('pagetitle');
	        $resource->set('pagetitle', preg_replace($pattern, $replacement, $current));
	    }
	    $resource->save();

	    print_r($resource->get('pagetitle'));
	    print_r($resource->get('id'));
	    print '<br>';
	}
	print 'OK';